dotNet使用HttpWebRequest模拟浏览器

2015-07-31 08:01:25来源:cnblogs.com作者:Carey Tzou人点击

在编写网络爬虫时,HttpWebRequest几乎可以完成绝大多数网站的抓取。为了更好地使用这一技术,我将常用的几个功能进行了封装,以方便调用。这个类已经在多个项目中得到使用,主要解决了Cookies相关的一些问题;如果有其它方面的问题可以提出来,我会进一步完善。

目前HttpHelper包含了以下几个方面:

  • GetHttpContent:通过Get或Post来获取网页的Html
  • SetCookie:根据response中头部的set-cookie对cookie进行设置,能识别httponly
  • GetAllCookies:将CookieContainer转换为键值对,方便存储和跨程序间调用
  • ConvertToCookieContainer:将键值对转换回CookieContainer供程序调用
  • BuildPostData:通过一个需要post的html构建出postdata

代码如下:

  using System;
  using System.Collections;
  using System.Collections.Generic;
  using System.Collections.Specialized;
  using System.Globalization;
  using System.IO;
  using System.IO.Compression;
  using System.Linq;
  using System.Net;
  using System.Net.Security;
  using System.Security.Cryptography.X509Certificates;
  using System.Text;
  using System.Text.RegularExpressions;
  using HtmlAgilityPack;

  namespace TNIdea.Common.Helper
  {
      /// <summary>
      /// Helpers built on <see cref="HttpWebRequest"/> for fetching pages the way a browser would:
      /// GET/POST with cookies, charset detection, Set-Cookie parsing, cookie (de)serialization
      /// and building POST data from an HTML form.
      /// </summary>
      public class HttpHelper
      {
          /// <summary>
          /// Pattern that extracts the charset name (named group <c>Charset</c>) from an HTML
          /// <c>meta</c> tag or an XML declaration. The value may be unquoted, double-quoted or
          /// single-quoted.
          /// </summary>
          public const string CharsetReg = @"(meta.*?charset=[""']?(?<Charset>[^\s""'>/]+)[""']?)|(xml.*?encoding=[""']?(?<Charset>[^\s""'>/]+)[""']?)";

          /// <summary>Maximum number of bytes pre-read while looking for the end of the HTML head.</summary>
          private const int HeadCacheLength = 10000;

          /// <summary>Marker that ends the pre-read of the document head.</summary>
          private const string HeadCloseTag = "</head>";

          /// <summary>
          /// User-Agent sent when the caller does not supply one. GET and POST share the same
          /// value so that one simulated session does not present two different browsers.
          /// </summary>
          private const string DefaultUserAgent = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36";

          /// <summary>Cached instance of <see cref="CharsetReg"/>.</summary>
          private static readonly Regex CharsetRegex = new Regex(CharsetReg, RegexOptions.IgnoreCase | RegexOptions.Multiline);

          /// <summary>
          /// Matches a Set-Cookie fragment that was cut right after the day name of its
          /// <c>expires</c> date (e.g. "...; expires=Wed"), i.e. at the comma inside the date.
          /// </summary>
          private static readonly Regex SplitExpiresRegex = new Regex(@"expires\s*=\s*[A-Za-z]+\s*$", RegexOptions.IgnoreCase);

          /// <summary>
          /// Fetches the content of a page. Issues a POST when <paramref name="postData"/> is
          /// non-blank, otherwise a GET.
          /// </summary>
          /// <param name="url">The URL to request.</param>
          /// <param name="postData">The form data to POST; null or blank means GET.</param>
          /// <param name="cookies">
          /// Cookies to send. When non-null, cookies parsed from the response's Set-Cookie
          /// headers are also added to this container.
          /// </param>
          /// <param name="userAgent">The User-Agent header; blank selects a built-in default.</param>
          /// <param name="referer">The Referer header.</param>
          /// <param name="cookiesDomain">
          /// Domain assigned to response cookies that do not declare one; blank means the host
          /// of the response URI.
          /// </param>
          /// <param name="encode">Encoding used to decode the body; null means auto-detect.</param>
          /// <returns>
          /// The decoded body. Returns the exception text when reading or decoding the body
          /// fails, and an empty string when the request itself fails (both kept for backward
          /// compatibility with existing callers).
          /// </returns>
          public static string GetHttpContent(string url, string postData = null, CookieContainer cookies = null, string userAgent = "", string referer = "", string cookiesDomain = "", Encoding encode = null)
          {
              try
              {
                  HttpWebResponse httpResponse = string.IsNullOrWhiteSpace(postData)
                      ? CreateGetHttpResponse(url, cookies: cookies, userAgent: userAgent, referer: referer)
                      : CreatePostHttpResponse(url, postData, cookies: cookies, userAgent: userAgent, referer: referer);

                  try
                  {
                      string content;
                      try
                      {
                          content = ReadContent(httpResponse, encode);
                      }
                      catch (Exception ex)
                      {
                          // Historical contract: a read/decode failure is reported as the body text.
                          return ex.ToString();
                      }

                      // Headers are read while the response is still open, and the cookies go
                      // into the caller's container instead of a throw-away one.
                      if (cookies != null)
                      {
                          MergeResponseCookies(httpResponse, cookies, cookiesDomain);
                      }

                      return content;
                  }
                  finally
                  {
                      httpResponse.Close();
                  }
              }
              catch (Exception)
              {
                  // Historical contract: request failures yield an empty string.
                  return string.Empty;
              }
          }

          /// <summary>
          /// Sends a GET request and returns the response.
          /// </summary>
          /// <param name="url">The URL to request.</param>
          /// <param name="timeout">The request timeout in milliseconds.</param>
          /// <param name="userAgent">The User-Agent header; blank selects a built-in default.</param>
          /// <param name="cookies">Cookies to send; null means a fresh empty container.</param>
          /// <param name="referer">The Referer header.</param>
          /// <returns>The response; the caller is responsible for closing it.</returns>
          /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
          /// <exception cref="NotSupportedException"><paramref name="url"/> is not an HTTP(S) URL.</exception>
          public static HttpWebResponse CreateGetHttpResponse(string url, int timeout = 60000, string userAgent = "", CookieContainer cookies = null, string referer = "")
          {
              HttpWebRequest request = CreateRequest(url, "GET", timeout, userAgent, cookies, referer);
              return (HttpWebResponse)request.GetResponse();
          }

          /// <summary>
          /// Sends a POST request with an <c>application/x-www-form-urlencoded</c> body and
          /// returns the response.
          /// </summary>
          /// <param name="url">The URL to request.</param>
          /// <param name="postData">The already-encoded form data, written as UTF-8.</param>
          /// <param name="timeout">The request timeout in milliseconds.</param>
          /// <param name="userAgent">The User-Agent header; blank selects a built-in default.</param>
          /// <param name="cookies">Cookies to send; null means a fresh empty container.</param>
          /// <param name="referer">The Referer header.</param>
          /// <returns>The response; the caller is responsible for closing it.</returns>
          /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
          /// <exception cref="NotSupportedException"><paramref name="url"/> is not an HTTP(S) URL.</exception>
          public static HttpWebResponse CreatePostHttpResponse(string url, string postData, int timeout = 60000, string userAgent = "", CookieContainer cookies = null, string referer = "")
          {
              HttpWebRequest request = CreateRequest(url, "POST", timeout, userAgent, cookies, referer);
              request.ContentType = "application/x-www-form-urlencoded";

              if (!string.IsNullOrWhiteSpace(postData))
              {
                  byte[] data = Encoding.UTF8.GetBytes(postData);
                  request.ContentLength = data.Length;
                  using (Stream stream = request.GetRequestStream())
                  {
                      stream.Write(data, 0, data.Length);
                  }
              }

              return (HttpWebResponse)request.GetResponse();
          }

          /// <summary>
          /// Enumerates a <see cref="CookieContainer"/> into one "name=value;name=value" string
          /// per domain, which is convenient for storage and for passing between programs.
          /// </summary>
          /// <param name="cookieContainer">The container to read.</param>
          /// <returns>A dictionary keyed by the container's internal domain key.</returns>
          /// <exception cref="ArgumentNullException"><paramref name="cookieContainer"/> is null.</exception>
          /// <remarks>
          /// Reads the non-public fields <c>m_domainTable</c> and <c>m_list</c> via reflection.
          /// NOTE(review): these names are implementation details of the runtime; confirm they
          /// exist on the target framework.
          /// </remarks>
          public static Dictionary<string, string> GetAllCookies(CookieContainer cookieContainer)
          {
              if (cookieContainer == null)
              {
                  throw new ArgumentNullException("cookieContainer");
              }

              var result = new Dictionary<string, string>();
              var domainTable = (Hashtable)GetPrivateField(cookieContainer, "m_domainTable");

              foreach (string domain in domainTable.Keys)
              {
                  var pathList = (SortedList)GetPrivateField(domainTable[domain], "m_list");
                  var pairs = new List<string>();

                  foreach (CookieCollection collection in pathList.Values)
                  {
                      foreach (Cookie cookie in collection)
                      {
                          pairs.Add(cookie.Name + "=" + cookie.Value);
                      }
                  }

                  result.Add(domain, string.Join(";", pairs));
              }

              return result;
          }

          /// <summary>
          /// Converts the per-domain cookie strings produced by <see cref="GetAllCookies"/> back
          /// into a <see cref="CookieContainer"/>. Every cookie gets the path "/".
          /// </summary>
          /// <param name="cookies">Domain mapped to "name=value;name=value".</param>
          /// <returns>A new container holding the cookies.</returns>
          /// <exception cref="ArgumentNullException"><paramref name="cookies"/> is null.</exception>
          public static CookieContainer ConvertToCookieContainer(Dictionary<string, string> cookies)
          {
              if (cookies == null)
              {
                  throw new ArgumentNullException("cookies");
              }

              var container = new CookieContainer();

              foreach (KeyValuePair<string, string> entry in cookies)
              {
                  if (string.IsNullOrEmpty(entry.Value))
                  {
                      continue;
                  }

                  foreach (string segment in entry.Value.Split(';'))
                  {
                      // A segment without "name=" cannot be a cookie; skipping it avoids the
                      // Substring(0, -1) failure such input used to cause.
                      int separator = segment.IndexOf('=');
                      if (separator <= 0)
                      {
                          continue;
                      }

                      string name = segment.Substring(0, separator).Trim();
                      if (name.Length == 0)
                      {
                          continue;
                      }

                      var cookie = new Cookie();
                      cookie.Name = name;
                      cookie.Value = segment.Substring(separator + 1);
                      cookie.Path = "/";
                      cookie.Domain = entry.Key;
                      container.Add(cookie);
                  }
              }

              return container;
          }

          /// <summary>
          /// Builds URL-encoded POST data ("name=value&amp;name=value") from the input elements
          /// of an HTML page that contains a form. Inputs without a <c>name</c> or without a
          /// <c>value</c> attribute are skipped.
          /// </summary>
          /// <param name="htmlContent">The HTML of the page.</param>
          /// <returns>
          /// The POST data, or an empty string when the HTML is empty, has no form, or has no
          /// usable inputs.
          /// </returns>
          public static string BuildPostData(string htmlContent)
          {
              if (string.IsNullOrEmpty(htmlContent))
              {
                  return string.Empty;
              }

              var htmlDoc = new HtmlDocument();
              htmlDoc.LoadHtml(htmlContent);

              HtmlNode form = htmlDoc.DocumentNode.SelectSingleNode("//form");
              if (form == null)
              {
                  return string.Empty;
              }

              // "//input" is an absolute XPath, so inputs of the whole document are collected.
              // NOTE(review): kept on purpose - some HtmlAgilityPack versions appear to parse
              // <form> as an element without children, in which case a relative ".//input"
              // would find nothing; confirm before narrowing the query.
              HtmlNodeCollection inputs = form.SelectNodes("//input");
              if (inputs == null)
              {
                  return string.Empty;
              }

              var pairs = new List<string>();
              foreach (HtmlNode input in inputs)
              {
                  HtmlAttribute name = input.Attributes["name"];
                  HtmlAttribute value = input.Attributes["value"];
                  if (name == null || value == null || string.IsNullOrEmpty(name.Value))
                  {
                      continue;
                  }

                  // Values such as base64 view-state contain '+', '/', '=' and must be encoded
                  // to survive an application/x-www-form-urlencoded body.
                  pairs.Add(WebUtility.UrlEncode(name.Value) + "=" + WebUtility.UrlEncode(value.Value));
              }

              return string.Join("&", pairs);
          }

          /// <summary>
          /// Creates and configures the request shared by the GET and POST entry points.
          /// </summary>
          private static HttpWebRequest CreateRequest(string url, string method, int timeout, string userAgent, CookieContainer cookies, string referer)
          {
              if (url == null)
              {
                  throw new ArgumentNullException("url");
              }

              if (url.StartsWith("https", StringComparison.OrdinalIgnoreCase))
              {
                  // This assignment is process-wide: it replaces any validation callback
                  // installed elsewhere in the application.
                  ServicePointManager.ServerCertificateValidationCallback = new RemoteCertificateValidationCallback(CheckValidationResult);
              }

              var request = WebRequest.Create(url) as HttpWebRequest;
              if (request == null)
              {
                  throw new NotSupportedException("Only http and https URLs are supported: " + url);
              }

              request.Referer = referer;
              request.Method = method;
              request.UserAgent = string.IsNullOrWhiteSpace(userAgent) ? DefaultUserAgent : userAgent;
              request.Timeout = timeout;
              request.KeepAlive = true;
              request.AllowAutoRedirect = true;
              request.CookieContainer = cookies ?? new CookieContainer();
              return request;
          }

          /// <summary>
          /// Server certificate validation callback. Accepts the certificate only when the
          /// platform reported no policy errors; self-signed or otherwise untrusted
          /// certificates are rejected.
          /// </summary>
          /// <param name="sender">The object that raised the validation.</param>
          /// <param name="certificate">The server certificate.</param>
          /// <param name="chain">The certificate chain.</param>
          /// <param name="errors">The policy errors found by the platform.</param>
          /// <returns>True when the certificate is accepted.</returns>
          private static bool CheckValidationResult(object sender, X509Certificate certificate, X509Chain chain, SslPolicyErrors errors)
          {
              return errors == SslPolicyErrors.None;
          }

          /// <summary>
          /// Reads and decodes the whole response body. When <paramref name="encode"/> is null
          /// the encoding is detected from the response charset and the document head.
          /// </summary>
          private static string ReadContent(HttpWebResponse response, Encoding encode)
          {
              using (Stream body = OpenDecodedStream(response))
              using (var buffer = new MemoryStream())
              {
                  string headText = ReadHead(body, buffer);
                  if (encode == null)
                  {
                      encode = DetectEncoding(response.CharacterSet, headText);
                  }

                  // Decode head and remainder in one pass so that a multi-byte character
                  // straddling the pre-read boundary is not corrupted.
                  body.CopyTo(buffer);
                  return encode.GetString(buffer.GetBuffer(), 0, (int)buffer.Length);
              }
          }

          /// <summary>
          /// Wraps the response stream in a decompressor matching the Content-Encoding header.
          /// </summary>
          private static Stream OpenDecodedStream(HttpWebResponse response)
          {
              Stream raw = response.GetResponseStream();
              switch ((response.ContentEncoding ?? string.Empty).ToUpperInvariant())
              {
                  case "GZIP":
                      return new GZipStream(raw, CompressionMode.Decompress);
                  case "DEFLATE":
                      return new DeflateStream(raw, CompressionMode.Decompress);
                  default:
                      return raw;
              }
          }

          /// <summary>
          /// Copies bytes from <paramref name="body"/> into <paramref name="buffer"/> until the
          /// closing head tag, the pre-read limit, or the end of the stream is reached, and
          /// returns those bytes as text (one char per byte) for charset sniffing.
          /// </summary>
          private static string ReadHead(Stream body, MemoryStream buffer)
          {
              var text = new StringBuilder();
              while (text.Length < HeadCacheLength && !EndsWithHeadCloseTag(text))
              {
                  // ReadByte returns an int so that -1 can signal end of stream; the value
                  // must be tested before it is narrowed to a byte.
                  int next = body.ReadByte();
                  if (next < 0)
                  {
                      break;
                  }

                  buffer.WriteByte((byte)next);
                  text.Append((char)next);
              }

              return text.ToString();
          }

          /// <summary>
          /// Tells whether the text read so far ends with the closing head tag (case-insensitive).
          /// </summary>
          private static bool EndsWithHeadCloseTag(StringBuilder text)
          {
              int tagLength = HeadCloseTag.Length;
              if (text.Length < tagLength)
              {
                  return false;
              }

              string tail = text.ToString(text.Length - tagLength, tagLength);
              return string.Equals(tail, HeadCloseTag, StringComparison.OrdinalIgnoreCase);
          }

          /// <summary>
          /// Chooses the body encoding: the response charset when it is specific and known,
          /// otherwise the charset declared in the document head, otherwise GB2312, and UTF-8
          /// as the last resort. Never returns null.
          /// </summary>
          private static Encoding DetectEncoding(string characterSet, string headText)
          {
              Encoding detected = null;

              // "ISO-8859-1" and "zh-cn" are treated as "not really specified", as before.
              bool isSpecific = !string.IsNullOrWhiteSpace(characterSet)
                  && !string.Equals(characterSet, "ISO-8859-1", StringComparison.OrdinalIgnoreCase)
                  && !string.Equals(characterSet, "zh-cn", StringComparison.OrdinalIgnoreCase);
              if (isSpecific)
              {
                  detected = TryGetEncoding(characterSet);
              }

              if (detected == null)
              {
                  Match match = CharsetRegex.Match(headText);
                  if (match.Success)
                  {
                      detected = TryGetEncoding(match.Groups["Charset"].Value);
                  }
              }

              return detected ?? TryGetEncoding("GB2312") ?? Encoding.UTF8;
          }

          /// <summary>
          /// Resolves an encoding by name, returning null instead of throwing when the name is
          /// blank, unknown or unsupported.
          /// </summary>
          private static Encoding TryGetEncoding(string name)
          {
              if (string.IsNullOrWhiteSpace(name))
              {
                  return null;
              }

              try
              {
                  return Encoding.GetEncoding(name.Trim());
              }
              catch (ArgumentException)
              {
                  return null;
              }
              catch (NotSupportedException)
              {
                  return null;
              }
          }

          /// <summary>
          /// Adds the cookies of a response to the caller's container, preferring the cookies
          /// parsed from the Set-Cookie headers and falling back to the response's own cookie
          /// collection.
          /// </summary>
          private static void MergeResponseCookies(HttpWebResponse response, CookieContainer cookies, string cookiesDomain)
          {
              if (string.IsNullOrWhiteSpace(cookiesDomain))
              {
                  cookiesDomain = response.ResponseUri.Host;
              }

              try
              {
                  cookies.Add(SetCookie(response, cookiesDomain) ?? response.Cookies);
              }
              catch (CookieException)
              {
                  // Best effort: a cookie the container rejects must not cost the caller the
                  // page content that was already downloaded.
              }
              catch (ArgumentException)
              {
                  // Same as above (e.g. a cookie that ended up without a domain).
              }
          }

          /// <summary>
          /// Parses the Set-Cookie headers of a response into cookies, recognising the
          /// HttpOnly and Secure flags.
          /// </summary>
          /// <param name="response">The response whose headers are read.</param>
          /// <param name="defaultDomain">Domain assigned to cookies that do not declare one.</param>
          /// <returns>
          /// The parsed cookies, or null when the response has no Set-Cookie header or the
          /// headers cannot be read.
          /// </returns>
          private static CookieCollection SetCookie(HttpWebResponse response, string defaultDomain)
          {
              try
              {
                  string[] headerValues = response.Headers.GetValues("Set-Cookie");
                  if (headerValues == null)
                  {
                      return null;
                  }

                  return ParseSetCookieHeaders(headerValues, defaultDomain);
              }
              catch (Exception)
              {
                  // Contract kept from the original: never throws, null means "use the fallback".
                  return null;
              }
          }

          /// <summary>
          /// Turns raw Set-Cookie header values into cookies. Values that were split at the
          /// comma inside an <c>expires</c> date are re-joined first; malformed cookies are
          /// skipped individually.
          /// </summary>
          private static CookieCollection ParseSetCookieHeaders(IEnumerable<string> headerValues, string defaultDomain)
          {
              var merged = new List<string>();
              foreach (string piece in headerValues)
              {
                  if (piece == null)
                  {
                      continue;
                  }

                  int last = merged.Count - 1;
                  if (last >= 0 && SplitExpiresRegex.IsMatch(merged[last]))
                  {
                      // The previous piece stops at "expires=<day name>", so this piece is the
                      // rest of that date (plus any attributes that follow it).
                      merged[last] = merged[last] + ", " + piece.TrimStart();
                  }
                  else
                  {
                      merged.Add(piece);
                  }
              }

              var cookies = new CookieCollection();
              foreach (string header in merged)
              {
                  Cookie cookie = ParseCookie(header, defaultDomain);
                  if (cookie != null)
                  {
                      cookies.Add(cookie);
                  }
              }

              return cookies;
          }

          /// <summary>
          /// Parses one Set-Cookie value. The first "name=value" pair is the cookie itself;
          /// the remaining segments are attributes. Returns null when the value has no usable
          /// name.
          /// </summary>
          private static Cookie ParseCookie(string header, string defaultDomain)
          {
              string[] segments = header.Split(';');

              int separator = segments[0].IndexOf('=');
              if (separator <= 0)
              {
                  return null;
              }

              string name = segments[0].Substring(0, separator).Trim();
              if (name.Length == 0)
              {
                  return null;
              }

              var cookie = new Cookie();
              try
              {
                  cookie.Name = name;
                  cookie.Value = segments[0].Substring(separator + 1).Trim();
              }
              catch (CookieException)
              {
                  return null;
              }

              for (int i = 1; i < segments.Length; i++)
              {
                  string attribute = segments[i].Trim();
                  if (attribute.Length == 0)
                  {
                      continue;
                  }

                  int equals = attribute.IndexOf('=');
                  string key = (equals < 0 ? attribute : attribute.Substring(0, equals)).Trim().ToLowerInvariant();
                  string value = equals < 0 ? string.Empty : attribute.Substring(equals + 1).Trim();

                  switch (key)
                  {
                      case "path":
                          if (value.Length > 0)
                          {
                              cookie.Path = value;
                          }
                          break;
                      case "expires":
                          DateTime expires;
                          // An unparsable date leaves the cookie as a session cookie instead
                          // of discarding every cookie of the response.
                          if (DateTime.TryParse(value, CultureInfo.InvariantCulture, DateTimeStyles.None, out expires))
                          {
                              cookie.Expires = expires;
                          }
                          break;
                      case "domain":
                          if (value.Length > 0)
                          {
                              cookie.Domain = value;
                          }
                          break;
                      case "httponly":
                          cookie.HttpOnly = true;
                          break;
                      case "secure":
                          cookie.Secure = true;
                          break;
                      default:
                          // Other attributes (max-age, samesite, ...) are ignored; they must
                          // not overwrite the cookie's name and value.
                          break;
                  }
              }

              if (string.IsNullOrEmpty(cookie.Domain))
              {
                  cookie.Domain = defaultDomain;
              }

              return cookie;
          }

          /// <summary>
          /// Reads a non-public instance field of <paramref name="target"/> via reflection.
          /// </summary>
          private static object GetPrivateField(object target, string fieldName)
          {
              return target.GetType().InvokeMember(
                  fieldName,
                  System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.GetField | System.Reflection.BindingFlags.Instance,
                  null,
                  target,
                  new object[] { });
          }
      }
  }

部分网站需要登录的问题,我已经着手通过另一个项目(imitate-login)来解决。目前还有许多网页使用了JavaScript或各种基于JS的框架来对网页进行数据加载,如何模拟执行JavaScript,我暂时还没找到比较优美的解决方案;如果大家有什么好的方案可以发给我,谢谢!

 未经授权,拒绝任何全文及摘要转载!

最新文章

123

最新摄影

微信扫一扫

第七城市微信公众平台