C#实战分享--爬虫的基础原理及实现_网站优化分享

网络爬虫基本原理可以概括为以下几个步骤：

建立待爬取的URL队列。爬虫首先需要确定要抓取的网页集合，这些URL形成一个队列。
下载和解析网页。爬虫从队列中取出一个URL，并下载此URL对应的网页，然后对该网页进行解析。
解析网页内容。爬虫下载网页后，需要解析这些内容，提取出有用的信息，如文本、图像和其他数据。
管理和调度任务。爬虫可能需要管理多个任务，包括URL队列、线程池和排重机制，以确保高效和稳定的运行。

保存数据。爬虫提取的信息需要被保存到适当的存储位置，如数据库、文件系统或其他数据存储服务。

博主曾经运营过一个段子网，当时写了个网络爬虫，抓取网上的段子到自己的网站数据库，以便我的段子网可以直接浏览这些内容。为便于观看，及便于让大家理解，以下代码我节选主要的内容，并在这次分享的过程中，加入了一些注释，以便大家理解。

应粉丝要求，增加了一篇文章，欢迎大家阅读：《网络爬虫和网站主的攻防--常见网站的反爬机制及应对》

一、根据上面的步骤，我们先建立URL队列，一般有3个队列（待抓取，已抓取，坏链）

using System;
using System.Collections.Generic;
using System.Text;
using System.Collections;
using xu_common.log;
namespace data_collection.spider_core
{
    /// <summary>
    /// Shared registry of crawl URLs, kept in three lists: URLs waiting to be fetched,
    /// URLs already fetched, and URLs that failed (bad links).
    /// </summary>
    /// <remarks>
    /// Each list is guarded by its own <c>SyncRoot</c>. The lists are plain
    /// <see cref="ArrayList"/>s that grow without bound, which is acceptable for a crawl
    /// targeted at a few sites; a whole-web crawl would need bounded storage to keep
    /// memory usage under control.
    /// </remarks>
    class UrlSet
    {
        public UrlSet()
        {
        }

        // Created by the static field initializer, which the runtime runs once, so
        // concurrent first accesses cannot end up with two different instances.
        private static readonly UrlSet __instance = new UrlSet();

        /// <summary>The shared instance used by all crawler tasks.</summary>
        public static UrlSet instance
        {
            get
            {
                return __instance;
            }
        }

        private readonly ArrayList _going_to_parse = new ArrayList();
        private readonly ArrayList _already_parse = new ArrayList();
        private readonly ArrayList _error_link = new ArrayList();

        /// <summary>
        /// Reports whether <paramref name="url"/> is in <paramref name="list"/> and, when it
        /// is not and <paramref name="add"/> is true, appends it. Check and insert happen
        /// under the list's lock.
        /// </summary>
        /// <returns>true if the URL was already present before the call.</returns>
        private static bool contains_or_add(ArrayList list, string url, bool add)
        {
            lock (list.SyncRoot)
            {
                bool found = list.Contains(url);
                if (!found && add)
                {
                    list.Add(url);
                }
                return found;
            }
        }

        /// <summary>Removes <paramref name="url"/> from <paramref name="list"/> (no-op if absent) under the list's lock.</summary>
        private static void remove_url(ArrayList list, string url)
        {
            lock (list.SyncRoot)
            {
                list.Remove(url);
            }
        }

        // Is the URL in the already-fetched list? When add is true and it is not, record it.
        private bool is_url_parsed(string url, bool add)
        {
            return contains_or_add(_already_parse, url, add);
        }

        // Is the URL in the already-fetched list?
        private bool is_url_parsed(string url)
        {
            return is_url_parsed(url, false);
        }

        // Is the URL in the pending list? When add is true and it is not, enqueue it.
        private bool is_url_going_to_parse(string url, bool add)
        {
            return contains_or_add(_going_to_parse, url, add);
        }

        // Is the URL in the pending list?
        private bool is_url_going_to_parse(string url)
        {
            return is_url_going_to_parse(url, false);
        }

        // Is the URL in the bad-link list? When add is true and it is not, record it there.
        // (This previously appended to the already-fetched list by mistake.)
        private bool is_url_error_lnk(string url, bool add)
        {
            return contains_or_add(_error_link, url, add);
        }

        // Is the URL in the bad-link list?
        private bool is_url_error_lnk(string url)
        {
            return is_url_error_lnk(url, false);
        }

        /// <summary>
        /// Adds a URL to the pending list unless it is already known.
        /// </summary>
        /// <param name="url">The URL to enqueue.</param>
        /// <returns>
        /// -1 if the URL has already been fetched; -2 if it is a known bad link;
        /// 1 if it is already pending; 0 if it was newly enqueued.
        /// Values &gt;= 0 mean OK, values &lt; 0 mean the URL was rejected.
        /// </returns>
        public int add_going_parse_url(string url)
        {
            // Hold the pending-list lock across all three checks so two callers cannot
            // both decide to enqueue the same URL.
            lock (_going_to_parse.SyncRoot)
            {
                if (is_url_parsed(url))
                {
                    return -1;
                }
                if (is_url_error_lnk(url))
                {
                    return -2;
                }
                if (is_url_going_to_parse(url, true))
                {
                    return 1;
                }
                return 0;
            }
        }

        /// <summary>
        /// Records a URL as fetched. The URL is removed from the pending and bad-link
        /// lists if it is present there.
        /// </summary>
        /// <param name="url">The URL that was fetched.</param>
        /// <returns>-1 if the URL was already recorded as fetched; otherwise 0.</returns>
        public int add_parsed_url(string url)
        {
            // Already fetched, so it must not be fetched again or stay flagged as bad.
            remove_url(_going_to_parse, url);
            remove_url(_error_link, url);
            if (is_url_parsed(url, true))
            {
                return -1;
            }
            return 0;
        }

        /// <summary>
        /// Records a URL as a bad link. The URL is removed from the pending list if it is
        /// present there, since it should not be fetched.
        /// </summary>
        /// <param name="url">The URL that failed.</param>
        /// <returns>-1 if the URL was already recorded as a bad link; otherwise 0.</returns>
        public int add_error_url(string url)
        {
            remove_url(_going_to_parse, url);
            if (is_url_error_lnk(url, true))
            {
                return -1;
            }
            return 0;
        }

        /// <summary>
        /// Removes the first pending URL and hands it to the caller.
        /// </summary>
        /// <param name="url">Receives the dequeued URL, or an empty string when nothing is pending.</param>
        /// <returns>true if a URL was dequeued; false if the pending list was empty.</returns>
        public bool pop_going_to_parse_url(ref string url)
        {
            url = "";
            lock (_going_to_parse.SyncRoot)
            {
                if (_going_to_parse.Count <= 0)
                {
                    return false;
                }
                url = _going_to_parse[0].ToString();
                _going_to_parse.RemoveAt(0);
                return true;
            }
        }

        /// <summary>Returns the number of URLs currently pending.</summary>
        public int going_to_parse_url_num()
        {
            lock (_going_to_parse.SyncRoot)
            {
                return _going_to_parse.Count;
            }
        }

        private string[] _no_parse_keyword = null;
        private int _no_parse_type = 3;

        /// <summary>
        /// Configures the keyword filter used by <see cref="IsNoParse"/>.
        /// </summary>
        /// <param name="str">Keyword list as a single string.</param>
        /// <param name="split">Separator passed to the project's Split helper together with <paramref name="str"/>.</param>
        /// <param name="type">
        /// Filter mode: 1 = only URLs containing a keyword are parsed; 2 = URLs containing a
        /// keyword are skipped; any other value disables the filter.
        /// </param>
        public void SetNoParseKeyWord(string str, string split, int type)
        {
            _no_parse_keyword = xu_common.CommonOperator.Split(str, split);
            _no_parse_type = type;
        }

        /// <summary>
        /// Decides whether a URL must be skipped according to the configured keyword filter.
        /// </summary>
        /// <param name="url">The URL to test.</param>
        /// <returns>true if the URL must NOT be parsed; false if it may be parsed.</returns>
        public bool IsNoParse(string url)
        {
            LogMsg.LogError(url + ", no_parse_type="+ _no_parse_type.ToString());
            if (_no_parse_type == 1)
            {
                // Whitelist mode: skip everything that matches no keyword.
                return !contains_any_keyword(url);
            }
            if (_no_parse_type == 2)
            {
                // Blacklist mode: skip everything that matches a keyword.
                return contains_any_keyword(url);
            }
            return false;
        }

        // True when url contains at least one configured keyword. An unset keyword list
        // is treated as empty instead of causing a NullReferenceException.
        private bool contains_any_keyword(string url)
        {
            string[] keywords = _no_parse_keyword;
            if (keywords == null)
            {
                return false;
            }
            for (int i = 0; i < keywords.Length; i++)
            {
                if (url.Contains(keywords[i]))
                {
                    return true;
                }
            }
            return false;
        }

        #region write back to file: ToString
        // Joins every entry of the list with "\r\n" (no trailing separator) and then
        // empties the list, all under the list's lock so concurrent writers cannot
        // invalidate the indices or lose entries between the read and the Clear.
        private static string drain_to_string(ArrayList list)
        {
            lock (list.SyncRoot)
            {
                StringBuilder joined = new StringBuilder();
                for (int i = 0; i < list.Count; i++)
                {
                    if (i > 0)
                    {
                        joined.Append("\r\n");
                    }
                    joined.Append(list[i].ToString());
                }
                list.Clear();
                return joined.ToString();
            }
        }

        /// <summary>
        /// Returns the pending URLs, one per line ("\r\n" separated).
        /// NOTE: this also clears the pending list.
        /// </summary>
        public string GoingParseToString()
        {
            return drain_to_string(_going_to_parse);
        }

        /// <summary>
        /// Returns the fetched URLs, one per line ("\r\n" separated).
        /// NOTE: this also clears the fetched list.
        /// </summary>
        public string AlreadyParsedToString()
        {
            return drain_to_string(_already_parse);
        }

        /// <summary>
        /// Returns the bad-link URLs, one per line ("\r\n" separated).
        /// NOTE: this also clears the bad-link list.
        /// </summary>
        public string ErrorUrlToString()
        {
            return drain_to_string(_error_link);
        }
        #endregion
    }
}

二、下载和解析网页

以下代码是一个线程抓取文件（非网页内容，如图片、文件）的主要代码：设置一个URL，抓取后把URL放到已抓取队列，并保存网页内容到文件中；抓取失败，则把URL放到错误队列。

有些注释掉的 MessageBox是博主当时调试用的，大家可忽略。

namespace data_collection.spider_core
{
    // 继承的Task类是博主写的公共类，这里不是关键代码，不再贴出。
    // 这里是一个子线程。需要设置抓取的URL和网页文件保存路径。为了加快抓取速度，需要启动多线程。
    /// <summary>
    /// Worker task that downloads one non-HTML resource (image, document, media file)
    /// to a local file, resuming a partial download when the server supports it.
    /// Call <see cref="SetUrl"/> and <see cref="SetFilePath"/> before running.
    /// </summary>
    class DownFileTask : xu_common.thread.Task
    {
        // Size of the copy buffer in bytes.
        private const int BufferSize = 1024;

        /// <summary>
        /// Downloads the configured URL into the configured file. On success the URL is
        /// recorded as fetched in <see cref="UrlSet"/>; on any failure (including failure
        /// to open the local file) the error is logged and the URL is recorded as a bad link.
        /// </summary>
        public override void Run()
        {
            try
            {
                // OpenOrCreate (not Append) so the file can be truncated if the server
                // turns out not to honour the Range request. FileMode is not a flags
                // enum, so its values must not be OR-ed together.
                using (FileStream fileStream = new FileStream(_filepath, FileMode.OpenOrCreate, FileAccess.Write))
                {
                    long existingLength = fileStream.Length;
                    HttpWebRequest myre = (HttpWebRequest)WebRequest.Create(_url);
                    if (existingLength > 0)
                    {
                        // Resume after the bytes that are already on disk. The long
                        // overload avoids an overflow for partial files above 2 GiB.
                        myre.AddRange(existingLength);
                    }

                    using (HttpWebResponse response = (HttpWebResponse)myre.GetResponse())
                    using (Stream inStream = response.GetResponseStream())
                    {
                        if (response.StatusCode == HttpStatusCode.PartialContent)
                        {
                            // The server sent only the missing tail: append to what we have.
                            fileStream.Seek(0, SeekOrigin.End);
                        }
                        else
                        {
                            // The server sent the whole resource: start the file over
                            // rather than appending a full copy to the partial one.
                            fileStream.SetLength(0);
                        }

                        byte[] buffer = new byte[BufferSize];
                        int readerLength;
                        while ((readerLength = inStream.Read(buffer, 0, buffer.Length)) > 0)
                        {
                            fileStream.Write(buffer, 0, readerLength);
                        }
                    }
                }

                // Download succeeded: move the URL to the fetched list.
                UrlSet.instance.add_parsed_url(_url);
            }
            catch (Exception ex)
            {
                xu_common.log.LogMsg.LogError("down file:" + _url + ", error.msg:" + ex.ToString());
                // Download failed: move the URL to the bad-link list.
                UrlSet.instance.add_error_url(_url);
            }
        }

        /// <summary>Sets the URL of the resource to download.</summary>
        public void SetUrl(string url) { _url = url; }
        private string _url;

        /// <summary>Sets the path of the local file the resource is written to.</summary>
        public void SetFilePath(string filepath) { _filepath = filepath; }
        private string _filepath;
    }
}

以上是下载文件和图像的代码。

------------------------------------------

以下代码是下载网页的核心代码。

1.初始化待抓取的URL，设置浏览器User-Agent（伪装成普通浏览器访问，避免被对应网站拒绝）。

2.下载URL对应的网页内容。

3.从URL的内容中解析结构化内容（根据规则），同时根据网页内容，解析到很多URL（网页的外链）。

3.1 如果URL是需要下载的文件或图片，则新起一个线程，用上面的代码下载。

3.2 如果URL是要继续抓取的网页URL，则放到待爬取链接。

注（重要）：抓取到的内容（保存为_content）的解析，博主写了一个规则处理器来完成，所以下面的代码里一句话带过。规则处理器比较简单，规则因下载的网站不同而不同。比如你想下载 http://xx.com/a/1.html，同一网站下同类页面的结构一般是通用的，我们就可以设置一个规则：只要URL满足 http://xx.com/a/*.html，则按以下规则处理：跳过页面头部等无关的HTML代码，直到找到内容开始处。这里需要我们实际看一下 http://xx.com/a/1.html 的源码，正文一般以某些特定的HTML标签等特征开始，我们把它之前的内容跳过即可。

using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Net;
using xu_common.log;
using System.Collections;
using System.IO;
using System.IO.Compression;
using System.Security.Policy;
namespace data_collection.spider_core
{
    class WebGetHtml
    {
        // 初始化要抓取的网页URL
        /// <summary>Creates a fetcher for a single page.</summary>
        /// <param name="parse_url">URL of the page to download and parse.</param>
        public WebGetHtml(string parse_url)
        {
            this._url = parse_url;
        }

        // URL of the page being fetched.
        private string _url = null;
        // Base URL used in preference to _url when resolving links, if set
        // (presumably taken from the page's <base> tag - confirm where it is assigned).
        private string _tag_base_url = null;
        private string _title;
        // Directory part of the base URL, without a trailing slash (see GenBaseUrl).
        private string _base_url = null;
        // Part of the base URL matched by global_var.RegTopUrl (see GenBaseUrl).
        private string _base_top_url = null;
        private string _content;
        /// <summary>
        /// Computes <c>_base_top_url</c> and <c>_base_url</c>, the two prefixes used to
        /// resolve relative links. When <c>_tag_base_url</c> is set it is used as the base;
        /// otherwise the base is derived from the page URL by dropping the query string
        /// and the last path segment. A trailing "/" is removed from <c>_base_url</c>.
        /// </summary>
        private void GenBaseUrl()
        {
            if (_tag_base_url != null)
            {
                // An explicit base overrides the page URL.
                // NOTE(review): RegTopUrl presumably matches the scheme-and-host prefix - confirm in global_var.
                _base_top_url = global_var.RegTopUrl.Match(_tag_base_url).Value;
                _base_url = _tag_base_url;
            }
            else
            {
                _base_top_url = global_var.RegTopUrl.Match(_url).Value;

                // Cut the query string off, if there is one.
                int query_start = this._url.IndexOf("?");
                string url_without_query = query_start < 0 ? _url : _url.Substring(0, query_start);

                // Drop everything after the last "/" so only the directory remains.
                _base_url = Regex.Replace(url_without_query, "(?<=.*/)[^/]*$", "", RegexOptions.Compiled | RegexOptions.IgnoreCase);
            }

            if (_base_url.EndsWith("/"))
            {
                _base_url = _base_url.Remove(_base_url.Length - 1);
            }
        }
        /// <summary>
        /// Integer codes describing what kind of resource a URL points to, as returned
        /// by the URL classifier. Declared as constants so they cannot be reassigned at
        /// runtime and can be used as switch labels.
        /// </summary>
        public class UrlType
        {
            /// <summary>Image file.</summary>
            public const int UrlTypeImg = 1;
            /// <summary>Downloadable file (media, document, archive).</summary>
            public const int UrlTypeFile = 2;
            /// <summary>Web page to crawl.</summary>
            public const int UrlTypeHtml = 3;
            /// <summary>Link that cannot or should not be fetched.</summary>
            public const int UrlTypeError = 4;
            /// <summary>Link pointing back to the page currently being parsed.</summary>
            public const int UrlTypeSelf = 5;
            /// <summary>Other auxiliary file (for example a stylesheet).</summary>
            public const int UrlTypeOtherFile = 6;
        };
        // 检查URL类型
        /// <summary>
        /// Classifies a link found on the current page.
        /// </summary>
        /// <param name="UrltoCheck">The link to classify.</param>
        /// <returns>
        /// One of the <see cref="UrlType"/> codes, decided in this order: error for null,
        /// empty or "#"-only links; self for a link to the current page (with or without a
        /// trailing "/"); other-file for ".css"; file or image by extension; error for
        /// schemes that cannot be fetched over HTTP; otherwise HTML.
        /// Extension and scheme matching ignores case.
        /// </returns>
        private int CheckUrl(string UrltoCheck)
        {
            if (UrltoCheck == null)
                return UrlType.UrlTypeError;
            if (Regex.IsMatch(UrltoCheck, "^#*$", RegexOptions.IgnoreCase | RegexOptions.Compiled))
                return UrlType.UrlTypeError;
            if (UrltoCheck == _url || (UrltoCheck + "/") == _url || UrltoCheck == (_url + "/"))
                return UrlType.UrlTypeSelf;
            if (UrltoCheck.EndsWith(".css", StringComparison.OrdinalIgnoreCase))
                return UrlType.UrlTypeOtherFile;
            if (UrlEndsWithAny(UrltoCheck, _file_url_suffixes))
                return UrlType.UrlTypeFile;
            if (UrlEndsWithAny(UrltoCheck, _img_url_suffixes))
                return UrlType.UrlTypeImg;
            if (UrlStartsWithAny(UrltoCheck, _unsupported_url_prefixes))
                return UrlType.UrlTypeError;
            return UrlType.UrlTypeHtml;
        }

        // Extensions treated as downloadable files.
        private static readonly string[] _file_url_suffixes =
        {
            ".wmv", ".asf", ".mp3", ".avi", ".mpg", ".mpeg", ".rmvb", ".rm",
            ".doc", ".rar", ".zip", ".tar", ".xls", ".pdf"
        };

        // Extensions treated as images.
        private static readonly string[] _img_url_suffixes =
        {
            ".jpg", ".jpeg", ".ico", ".gif", ".bmp", ".png"
        };

        // Link prefixes that cannot be fetched by the HTTP crawler. "rstp://" is the
        // misspelling the original check used; it is kept next to the correct "rtsp://".
        private static readonly string[] _unsupported_url_prefixes =
        {
            "ftp://", "telnet://", "mms://", "rtsp://", "rstp://", "mailto", "javascript"
        };

        // True when url ends with any of the given suffixes, ignoring case.
        private static bool UrlEndsWithAny(string url, string[] suffixes)
        {
            foreach (string suffix in suffixes)
            {
                if (url.EndsWith(suffix, StringComparison.OrdinalIgnoreCase))
                    return true;
            }
            return false;
        }

        // True when url starts with any of the given prefixes, ignoring case.
        private static bool UrlStartsWithAny(string url, string[] prefixes)
        {
            foreach (string prefix in prefixes)
            {
                if (url.StartsWith(prefix, StringComparison.OrdinalIgnoreCase))
                    return true;
            }
            return false;
        }
        //确定URL是否属于要抓取的网站
        /// <summary>
        /// Asks the global configuration whether a URL belongs to a site being crawled.
        /// </summary>
        /// <param name="NewUrltoCheck">The URL to test.</param>
        /// <returns>The result of <c>global_var.Instance.IsInSite</c> for the URL.</returns>
        private bool CheckUrlThisSite(string NewUrltoCheck)
        {
            bool in_site = global_var.Instance.IsInSite(NewUrltoCheck);
            return in_site;
        }
        // 有些网页的链接不是以http开头，而是相对链接。需要处理这些不规则链接
        /// <summary>
        /// Turns a link found on the page into an absolute URL.
        /// </summary>
        /// <remarks>
        /// Resolution rules, relative to the base computed by <c>GenBaseUrl</c>:
        /// <list type="bullet">
        /// <item>"http://..." or "https://..." - already absolute, returned unchanged;</item>
        /// <item>"/x.aspx" - appended to the top URL (<c>_base_top_url</c>);</item>
        /// <item>"x.aspx" or "./x.aspx" - appended to the base directory (<c>_base_url</c>);</item>
        /// <item>"../x.aspx" - appended to the parent of the base directory, one level up per "../".</item>
        /// </list>
        /// When there are more "../" segments than directories in the base, the top URL is
        /// used instead of climbing above the host.
        /// </remarks>
        /// <param name="incomeUrl">The link as it appears in the page.</param>
        /// <returns>
        /// The absolute URL, or null when the link is null or starts with "." in a form
        /// other than "./" or "../".
        /// </returns>
        private string GenUrl(string incomeUrl)
        {
            if (incomeUrl == null)
                return null;

            if (incomeUrl.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
                incomeUrl.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
                return incomeUrl;

            if (incomeUrl.StartsWith("/", StringComparison.Ordinal))
                return _base_top_url + incomeUrl;

            // Strip leading "./" and "../" segments, counting how many levels to climb.
            int parent_depth = 0;
            while (incomeUrl.StartsWith(".", StringComparison.Ordinal))
            {
                if (incomeUrl.StartsWith("../", StringComparison.Ordinal))
                {
                    parent_depth += 1;
                    // The result must be assigned back; otherwise the loop never advances.
                    incomeUrl = incomeUrl.Substring(3);
                }
                else if (incomeUrl.StartsWith("./", StringComparison.Ordinal))
                {
                    incomeUrl = incomeUrl.Substring(2);
                }
                else
                    return null;
            }

            string head_str = _base_url;
            for (int i = 0; i < parent_depth; i++)
            {
                int slash_position = head_str.LastIndexOf('/');
                if (slash_position < 0)
                {
                    // Nothing left to climb: fall back to the top URL.
                    head_str = _base_top_url;
                    break;
                }
                head_str = head_str.Substring(0, slash_position);
            }

            if (IsBareHttpScheme(head_str))
            {
                // Only the scheme is left (e.g. base http://a.com/b/ with ../../../a.html,
                // or a page URL that has no path at all): resolve against the top URL.
                head_str = _base_top_url;
            }
            return head_str + "/" + incomeUrl;
        }

        // True when head is just an http/https scheme marker with no host left, i.e.
        // "http:", "http:/", "http://" or the https equivalents (case-insensitive).
        private static bool IsBareHttpScheme(string head)
        {
            foreach (string scheme in new string[] { "http://", "https://" })
            {
                if (head.Length >= scheme.Length - 2 &&
                    head.Length <= scheme.Length &&
                    scheme.StartsWith(head, StringComparison.OrdinalIgnoreCase))
                    return true;
            }
            return false;
        }
        //下载网页内容
        private bool WebGetContent()
        {
            HttpWebRequest myRequest = (HttpWebRequest)WebRequest.Create(_url);
            // 设置浏览器代理
            myRequest.UserAgent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36";
            //Encoding encode = System.Text.Encoding.GetEncoding(936);
            HttpWebResponse myResponse = (HttpWebResponse)myRequest.GetResponse();
            //myResponse.ContentEncoding;
            //LogMsg.instance.LOG(LogMsg.DEBUG, "begin to parse response");
            //LogMsg.LogDebug("begin to parse response");
            if (myResponse.StatusCode != HttpStatusCode.OK)
            {
                LogMsg.LogError("get url:" + _url + ", error. status:"+myResponse.StatusDescription);
                return false;
            }
            Stream src_stream = myResponse.GetResponseStream();
            StreamReader myStreamReader = null;
            if (myResponse.ContentEncoding != null &&
                myResponse.ContentEncoding.Equals("gzip", StringComparison.InvariantCultureIgnoreCase))
            {
                myStreamReader = new StreamReader(new GZipStream(src_stream, CompressionMode.Decompress),
                    Encoding.Default);
            }
            else
            {
                myStreamReader = new StreamReader(src_stream,
                    Encoding.Default);
            }
              
            ArrayList response_bytes_array = new ArrayList();
            int i_byte = 0;
            while (true)
            {
                i_byte = myStreamReader.BaseStream.ReadByte();
                if (i_byte == -1)
                    break;
                response_bytes_array.Add(Convert.ToByte(i_byte));
            }
            byte[] response_bytes = new byte[response_bytes_array.Count];
            for(int i=0;i 
三、管理和调度任务 
这里管理和调度任务不再赘述，如上面下载文件和图片一样，可以使用多线程进行下载。这些管理代码不难，且上面已有示例，下载网页的任务调度类似。保存在Url队列中，按顺序启动多线程去获取内容。 
四、保存数据 
这块代码比较隐私。博主在上面的代码处也有说明。是根据对应的网站，在获取网页源码内容后，需要自己写规则解析，解析到对应的内容后，直接调用DB操作类写入到数据库。 
以上就是本次分享的内容。如果有兴趣的，欢迎关注博主私聊。 
博主其它经典原创：《管理心得--工作目标应该是解决业务问题，而非感动自己》，《管理心得--如何高效进行跨部门合作》，《管理心得--员工最容易犯的错误：以错误去掩盖错误》，《技术心得--如何成为优秀的架构师》、《管理心得--如何成为优秀的架构师》、《管理心理--程序员如何选择职业赛道》。欢迎大家阅读。
  免费二级域名分发网站  贵州网站建设平台  做网站建设哪家好  一键优化  长安网站建设设计  云速建站

上一篇：MySQL面试八股文（2022最新整理）

下一篇：Springboot解决跨域问题方案总结(包括Nginx，Gateway网关等)

网络爬虫基本原理可以概括为以下几个步骤：

应粉丝要求，增加了一篇文章，欢迎大家阅读：《网络爬虫和网站主的攻防--常见网站的反爬机制及应对》

一、根据上面的步骤，我们先建立URL队列，一般有3个队列（待抓取，已抓取，坏链）

二、下载和解析网页

三、管理和调度任务

四、保存数据

应粉丝要求，增加了一篇文章，欢迎大家阅读：《网络爬虫和网站主的攻防--常见网站的反爬机制及应对》