声明:此正则表达式只适用于 .NET。使用流程为:发送 HTTP 请求取回整个 HTML 网页,然后从该 HTML 页面中抓取想要的数据。
第一部分:发送httpWebRequest 请求
C#代码
// Target URL ("URL" is a placeholder for the page to download).
HttpWebRequest request = (HttpWebRequest)WebRequest.Create("URL");
// Browser identification. Request headers must be configured before
// GetResponse() is called, because that call is what submits the request.
request.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; .NET CLR 3.0.04506; .NET CLR 3.5.21022; .NET CLR 1.0.3705; .NET CLR 1.1.4322)";
// Full HTML of the returned page.
String htmlStr;
// Dispose the response and the reader so the underlying connection is released.
using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("UTF-8")))
{
    htmlStr = reader.ReadToEnd();
}
第二部分:根据返回的html获取有用数据,此方法适用于所有想通过ID或Class等等的标签找到html的需求,拿下面一个方法为例
C#代码
/// <summary>
/// Extracts the color names from a product page.
/// </summary>
/// <param name="htmlStr">Full HTML of the page to scan.</param>
/// <returns>
/// The tag-stripped text of every &lt;a&gt; element found inside the second element
/// whose class attribute is DetailsC_Sku, each followed by a comma. Returns an empty
/// string when the input is null/empty or fewer than two such elements exist.
/// </returns>
public String getColor(String htmlStr)
{
    // Matches one complete element (opening tag through its matching closing tag)
    // whose class attribute is DetailsC_Sku. The Nested balancing group keeps track
    // of inner elements with the same tag name so the match ends at the right
    // closing tag. To select by id instead, replace [cC][lL][aA][sS][sS] with [iI][dD].
    const string skuBlockPattern =
        @"<(?<HtmlTag>[\w]+)[^>]*\s[cC][lL][aA][sS][sS]=(?<Quote>[""']?)DetailsC_Sku(?(Quote)\k<Quote>)[""']?[^>]*>" +
        @"((?<Nested><\k<HtmlTag>[^>]*>)|</\k<HtmlTag>>(?<-Nested>)|.*?)*</\k<HtmlTag>>";

    if (String.IsNullOrEmpty(htmlStr))
    {
        return "";
    }

    // First DetailsC_Sku element on the page (presumably the size selector,
    // going by the variable name; verify against the target page).
    String sizeHtml = Regex.Match(htmlStr, skuBlockPattern, RegexOptions.Singleline).ToString();
    if (String.IsNullOrEmpty(sizeHtml))
    {
        return "";
    }

    // Remove the first element so the same pattern now finds the next one.
    String remainingHtml = htmlStr.Replace(sizeHtml, "");
    String colorHtml = Regex.Match(remainingHtml, skuBlockPattern, RegexOptions.Singleline).ToString();
    if (String.IsNullOrEmpty(colorHtml))
    {
        return "";
    }

    // Collect the text of every <a> element inside the color block.
    StringBuilder colors = new StringBuilder();
    foreach (Match anchor in Regex.Matches(colorHtml, @"<a.*?>[\s\S]*?</a>"))
    {
        // A comma follows every entry, including the last one.
        colors.Append(RemoveHtml(anchor.Value)).Append(",");
    }
    return colors.ToString();
}
C#代码
// Compiled once and shared; building compiled Regex instances on every call is expensive.
// Singleline lets '.' cross line breaks so multi-line <style>/<script> blocks are removed whole.
private static readonly Regex s_styleBlockRegex = new Regex(@"<style(.*?)</style>", RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline);
private static readonly Regex s_scriptBlockRegex = new Regex(@"<script(.*?)</script>", RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline);
private static readonly Regex s_htmlTagRegex = new Regex(@"<[^>]+>", RegexOptions.Compiled | RegexOptions.IgnoreCase);
private static readonly Regex s_nbspEntityRegex = new Regex(@"&nbsp;", RegexOptions.Compiled | RegexOptions.IgnoreCase);
private static readonly Regex s_whitespaceRunRegex = new Regex(@"\s{2,}|&nbsp;", RegexOptions.Compiled | RegexOptions.IgnoreCase);

/// <summary>
/// Strips HTML markup from a string and returns the remaining text.
/// </summary>
/// <param name="src">HTML fragment to clean.</param>
/// <returns>
/// The text with style/script blocks and all tags removed, &amp;nbsp; entities and
/// runs of whitespace collapsed to a single space, and surrounding whitespace trimmed.
/// Returns an empty string when <paramref name="src"/> is null or empty.
/// </returns>
public string RemoveHtml(string src)
{
    if (string.IsNullOrEmpty(src))
    {
        return string.Empty;
    }

    // Drop style/script blocks first so their contents do not survive tag stripping.
    src = s_styleBlockRegex.Replace(src, string.Empty);
    src = s_scriptBlockRegex.Replace(src, string.Empty);
    src = s_htmlTagRegex.Replace(src, string.Empty);
    src = s_nbspEntityRegex.Replace(src, " ");
    src = s_whitespaceRunRegex.Replace(src, " ");
    return src.Trim();
}
每个人都需要一台速度更快、更稳定的 PC。随着时间的推移,垃圾文件、旧注册表数据和不必要的后台进程会占用资源并降低性能。幸运的是,许多工具可以让 Windows 保持平稳运行。
Copyright 2014-2025 https://www.php.cn/ All Rights Reserved | php.cn | 湘ICP备2023035733号