博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
抓取html 写正则
阅读量:6575 次
发布时间:2019-06-24

本文共 4356 字,大约阅读时间需要 14 分钟。

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.IO;
using System.IO.Compression;
using System.Text.RegularExpressions;

namespace WikiPageCreater.Common
{
    /// <summary>
    /// Best-effort helpers for downloading a web page and detecting its character set.
    /// Both public methods never throw: any failure yields a fallback value.
    /// </summary>
    public class PageHelper
    {
        /// <summary>Request timeout, in milliseconds.</summary>
        private const int RequestTimeoutMilliseconds = 20000;

        /// <summary>Responses whose declared Content-Length is this large (1 MiB) or larger are not read.</summary>
        private const long MaxContentLength = 1024 * 1024;

        /// <summary>
        /// Finds a "charset=..." declaration and captures its value in the "charset" group.
        /// An optional opening quote is skipped so that both
        /// <c>content="text/html; charset=gb2312"</c> and <c>&lt;meta charset="utf-8"&gt;</c> yield the bare name.
        /// </summary>
        private static readonly Regex CharsetRegex = new Regex(
            @"charset\b\s*=\s*[""']?(?<charset>[^""'\s;/>]+)",
            RegexOptions.IgnoreCase | RegexOptions.Compiled);

        /// <summary>
        /// Detects the character set of the page at <paramref name="url"/>.
        /// </summary>
        /// <param name="url">Absolute URL of the page to inspect.</param>
        /// <returns>
        /// The charset declared inside the page body if one is found; otherwise the charset reported by the
        /// response; otherwise (including on any error, a non-200 status, or a declared length of 1 MiB or more)
        /// <c>Encoding.Default.BodyName</c>.
        /// </returns>
        public static string GetEncoding(string url)
        {
            try
            {
                HttpWebRequest request = CreateRequest(url);
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    if (IsReadable(response))
                    {
                        // The charset declaration itself is plain ASCII, so ASCII decoding is enough
                        // to locate it regardless of the page's real encoding.
                        string html;
                        using (StreamReader reader = OpenReader(response, Encoding.ASCII))
                        {
                            html = reader.ReadToEnd();
                        }

                        Match match = CharsetRegex.Match(html);
                        if (match.Success)
                        {
                            return match.Groups["charset"].Value;
                        }

                        if (!string.IsNullOrEmpty(response.CharacterSet))
                        {
                            return response.CharacterSet;
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                // Deliberately best-effort: callers rely on always getting a usable encoding name back.
                System.Diagnostics.Trace.TraceWarning("PageHelper.GetEncoding failed for '{0}': {1}", url, ex.Message);
            }

            return Encoding.Default.BodyName;
        }

        /// <summary>
        /// Downloads the HTML source of the page at <paramref name="url"/>, decoding it with <paramref name="encoding"/>.
        /// </summary>
        /// <param name="url">Absolute URL of the page to download.</param>
        /// <param name="encoding">Encoding used to decode the response body.</param>
        /// <returns>
        /// The page source, or <c>string.Empty</c> on any error, a non-200 status,
        /// or a declared length of 1 MiB or more.
        /// </returns>
        public static string GetHtml(string url, Encoding encoding)
        {
            try
            {
                HttpWebRequest request = CreateRequest(url);
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    if (IsReadable(response))
                    {
                        using (StreamReader reader = OpenReader(response, encoding))
                        {
                            return reader.ReadToEnd();
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                // Deliberately best-effort: an unreachable or malformed page is reported as empty content.
                System.Diagnostics.Trace.TraceWarning("PageHelper.GetHtml failed for '{0}': {1}", url, ex.Message);
            }

            return string.Empty;
        }

        /// <summary>Builds the request shared by both public methods: 20 s timeout, redirects not followed.</summary>
        private static HttpWebRequest CreateRequest(string url)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Timeout = RequestTimeoutMilliseconds;
            request.AllowAutoRedirect = false;
            return request;
        }

        /// <summary>True when the response is 200 OK and its declared length is below the size cap.</summary>
        private static bool IsReadable(HttpWebResponse response)
        {
            // NOTE(review): ContentLength is documented as -1 when the server sends no Content-Length,
            // so the cap only applies to responses that declare their size.
            return response.StatusCode == HttpStatusCode.OK && response.ContentLength < MaxContentLength;
        }

        /// <summary>Opens a text reader over the response body, transparently un-gzipping it when Content-Encoding is gzip.</summary>
        private static StreamReader OpenReader(HttpWebResponse response, Encoding encoding)
        {
            Stream body = response.GetResponseStream();
            if (string.Equals(response.ContentEncoding, "gzip", StringComparison.OrdinalIgnoreCase))
            {
                body = new GZipStream(body, CompressionMode.Decompress);
            }

            return new StreamReader(body, encoding);
        }
    }
}

抓取后

 

 

// Captures the href and title attribute values of an anchor tag in the named groups "href" and "title".
// NOTE(review): the leading "<a.*?href=" and both group names were stripped when this page was extracted
// and are reconstructed here — confirm against the markup being scraped.
Regex regex = new Regex("<a.*?href=\"(?<href>.*?)\".*?title=\"(?<title>.*?)\".*?>", RegexOptions.Compiled);

可以取到href和title

 

// Captures the src attribute of an <img> tag: group 1 is the optional opening quote, group 2 is the URL.
// The attribute name is "src" (the listing had the typo "scr", which can never match real markup).
Regex reg = new Regex(@"(?is)<img[^>]*?src=(['""\s]?)([^'""\s]+)\1[^>]*?>");

 

 

 

// Captures an image src plus a link's href and title from a run of list items in a single match;
// other fields can be extracted the same way.
// NOTE(review): every tag inside this pattern was stripped when the page was extracted. The "<li>",
// "<img.*?src=", "<a.*?href=" pieces and the group names are a best-effort reconstruction —
// verify against the markup being scraped before relying on it.
Regex regexObj = new Regex("<li><img.*?src=\"(?<src>.+?)\".*?><li>.*?<li>.*?<a.*?href=\"(?<href>.+?)\".*?>(?<title>.*?)</a></li>", RegexOptions.Singleline);
  •  

     

    转载地址:http://ybgjo.baihongyu.com/

    你可能感兴趣的文章
    odoo开发笔记 -- 翻译机制及导入.po文件
    查看>>
    运维邮件
    查看>>
    Sql异常①
    查看>>
    横向无缝滚动
    查看>>
    PreparedStatement设置时间
    查看>>
    CF533C:Board Game(博弈)
    查看>>
    HDU5389:Zero Escape(dp & 类背包)
    查看>>
    jQ常见数组问题
    查看>>
    SEO优化:WordPress发布文章主动推送到百度,加快收录保护原创
    查看>>
    小学期学习总结一
    查看>>
    ScrollGridView 标题不变化 内容变化
    查看>>
    LeetCode - 16. 3Sum Closest
    查看>>
    LeetCode - 7. Reverse Integer
    查看>>
    MFC下运行控制台不显示黑屏
    查看>>
    算法练习——聪明的情侣
    查看>>
    Java多线程系列 面试题
    查看>>
    AOP jdk动态代理
    查看>>
    windows常用操作
    查看>>
    NYOJ-85 有趣的数 AC 分类: NYOJ ...
    查看>>
    (一)linux下hadoop安装配置
    查看>>