提供一段程式,可以透過 URL 將其所指定的網頁抓回,並存成字串。
接著可以針對這個回傳的字串進行文字分析處理,以取得我們想要的資訊。
#region Retrieves the HTML from the specified URL.
/// <summary>
/// Downloads the resource at the specified URL and returns its body decoded as UTF-8 text.
/// </summary>
/// <param name="pageUrl">URL of the web page whose HTML should be retrieved.</param>
/// <param name="timeoutSeconds">The timeout for the HTTP request, in seconds.</param>
/// <returns>
/// The retrieved HTML, or an empty string if the request could not be created,
/// failed, timed out, or the response could not be read.
/// </returns>
public string GetPageHTML(string pageUrl, int timeoutSeconds)
{
    try
    {
        // Set up the web request.
        System.Net.WebRequest request = System.Net.WebRequest.Create(pageUrl);

        // WebRequest.Timeout is expressed in milliseconds. Multiply in 64-bit and clamp to the
        // int range so a very large timeoutSeconds cannot silently wrap around.
        long timeoutMilliseconds = (long)timeoutSeconds * 1000L;
        if (timeoutMilliseconds > int.MaxValue)
        {
            timeoutMilliseconds = int.MaxValue;
        }
        else if (timeoutMilliseconds < int.MinValue)
        {
            timeoutMilliseconds = int.MinValue;
        }
        request.Timeout = (int)timeoutMilliseconds;

        // The using statements dispose the response, its stream and the reader on every exit
        // path (normal return or exception), so no explicit finally/Close is needed.
        using (System.Net.WebResponse response = request.GetResponse())
        using (System.IO.Stream responseStream = response.GetResponseStream())
        {
            if (responseStream == null)
            {
                // No body to read; treat it the same as any other failure.
                return "";
            }

            // The body is always decoded as UTF-8, regardless of the charset the server reports.
            using (System.IO.StreamReader reader = new System.IO.StreamReader(responseStream, System.Text.Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort: any error while fetching or reading the page
        // is reported to the caller as an empty string.
        return "";
    }
}
#endregion
// Test handler: fetches the page whose address is entered in txtURL
// and displays the returned HTML in txtResult.
protected void btnGet_Click(object sender, EventArgs e)
{
    // Strip surrounding whitespace from the address the user entered.
    string requestedUrl = txtURL.Text.Trim();

    // Second argument is the request timeout in seconds.
    string pageHtml = GetPageHTML(requestedUrl, 100);

    txtResult.Text = pageHtml;
}
streamReceive
GetResponseStream
encoding
streamRead
參考:透過 HTML Agility Pack 抓網頁資料
沒有留言:
張貼留言