提供一段程式,可以透過 URL 將其所指定的網頁抓回,並存成字串。
可以針對這回傳字串進行文字分析處理以取得我們想要的資訊。
#region Retrieves the HTML from the specified URL.
/// <summary>
/// Retrieves the HTML from the specified URL.
/// </summary>
/// <remarks>
/// The response body is always decoded as UTF-8; a page served in another
/// encoding (for example Big5) needs a different <see cref="System.Text.Encoding"/>
/// passed to the reader. Any exception raised while creating the request,
/// fetching the response or reading the body is caught and reported to the
/// caller as an empty string.
/// </remarks>
/// <param name="pageUrl">URL of the web page to retrieve HTML from.</param>
/// <param name="timeoutSeconds">The timeout for the HTTP request, in seconds.</param>
/// <returns>The retrieved HTML, or an empty string if any error occurred.</returns>
public string GetPageHTML(string pageUrl, int timeoutSeconds)
{
    try
    {
        // Set up the web request; the seconds argument is converted to the
        // millisecond value assigned to Timeout.
        System.Net.WebRequest request = System.Net.WebRequest.Create(pageUrl);
        request.Timeout = timeoutSeconds * 1000;

        // Stacked using statements guarantee that the response, its stream and
        // the reader are all released, on success and on failure alike.
        using (System.Net.WebResponse response = request.GetResponse())
        using (System.IO.Stream responseStream = response.GetResponseStream())
        using (System.IO.StreamReader reader = new System.IO.StreamReader(responseStream, System.Text.Encoding.UTF8))
        {
            return reader.ReadToEnd();
        }
    }
    catch (Exception ex)
    {
        // Best-effort contract: callers get an empty string instead of an
        // exception. The cause is still written to the debug output so that a
        // failure is not completely invisible during development.
        System.Diagnostics.Debug.WriteLine("GetPageHTML failed for '" + pageUrl + "': " + ex.Message);
        return "";
    }
}
#endregion

// for test
/// <summary>
/// Click handler used for manual testing: fetches the page whose URL is typed
/// into <c>txtURL</c> (with a 100 second timeout) and shows the result in
/// <c>txtResult</c>.
/// </summary>
/// <param name="sender">The control that raised the event.</param>
/// <param name="e">Event data (unused).</param>
protected void btnGet_Click(object sender, EventArgs e)
{
    string pageHtml = GetPageHTML(txtURL.Text.Trim(), 100);
    txtResult.Text = pageHtml;
}
streamReceive
GetResponseStream
encoding
streamRead
參考:透過 HTML Agility Pack 抓網頁資料
沒有留言:
張貼留言