c# 使用HTML解析器NSoup爬取小说

闲着没事,想试一试爬取一些小说,看了下园子里很多前辈写的一些文章,很受启发。
说下我的思路:查看文章网页链接---->后台远程抓取到Html代码---->分析所需数据结构---->提取所需信息
在这其中则免不了对html的一些操作。
方法很多种,具体移步前辈文章:https://www.cnblogs.com/cang12138/p/7464226.html?utm_source=debugrun&utm_medium=referral
在这里我贴出我自己测试过的代码,以此记录一下

using NSoup;
using NSoup.Nodes;
using NSoup.Select;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Web.Mvc;

namespace PaChongDemo.Controllers
{
    /// <summary>
    /// Demo crawler: downloads novel chapter pages with the NSoup HTML parser and
    /// saves each chapter as a .txt file under /Content/&lt;book title&gt;/.
    /// </summary>
    public class HomeController : Controller
    {
        // Table-of-contents (index) pages of the novels to crawl.
        private readonly string[] urlArray = new string[]
        {
            "https://www.ddxsku.com/files/article/html/23/23024/index.html",
            "https://www.ddxsku.com/files/article/html/2/2739/index.html"
        };

        // Characters that must not appear in a Windows file name (plus ';' and ','
        // which the original stripped as well). Cached so the pattern is built once
        // instead of on every KillBadChar call.
        private static readonly Regex BadFileNameChars = new Regex(@"[:;/\\|,*?""<>]");

        /// <summary>
        /// Entry point: crawls every configured novel, then renders the default view.
        /// </summary>
        public ActionResult Index()
        {
            foreach (var item in urlArray)
            {
                NSoup(item);
            }
            return View();
        }

        /// <summary>
        /// Sends an HTTP POST (despite the legacy name "HttpGet") to <paramref name="Url"/>
        /// with <paramref name="postDataStr"/> as a gb2312-encoded form body, and returns
        /// the response body decoded as UTF-8.
        /// </summary>
        /// <param name="Url">Target URL.</param>
        /// <param name="postDataStr">Form data to post (may be empty).</param>
        /// <returns>The response body as a string.</returns>
        public string HttpGet(string Url, string postDataStr)
        {
            CookieContainer cookie = new CookieContainer();
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
            request.Method = "POST";
            request.ContentType = "application/x-www-form-urlencoded";
            request.CookieContainer = cookie;

            // BUG FIX: the body is written as gb2312, so ContentLength must be the
            // gb2312 byte count (the original used the UTF-8 byte count, which
            // disagrees whenever postDataStr contains non-ASCII characters).
            byte[] body = Encoding.GetEncoding("gb2312").GetBytes(postDataStr);
            request.ContentLength = body.Length;

            using (Stream requestStream = request.GetRequestStream())
            {
                requestStream.Write(body, 0, body.Length);
            }

            // using blocks guarantee the response and its streams are closed even if
            // ReadToEnd throws (the original leaked them on the exception path).
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                response.Cookies = cookie.GetCookies(response.ResponseUri);
                using (Stream responseStream = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(responseStream, Encoding.GetEncoding("utf-8")))
                {
                    return reader.ReadToEnd();
                }
            }
        }

        /// <summary>
        /// Appends one chapter to &lt;path&gt;/&lt;name&gt;.txt, creating the directory
        /// and the file on first use.
        /// </summary>
        /// <param name="content">Chapter text.</param>
        /// <param name="name">File name without extension (already sanitized).</param>
        /// <param name="path">Target directory.</param>
        public void Novel(string content, string name, string path)
        {
            string Log = content + "\r\n";

            // Create the target directory on first use.
            if (!Directory.Exists(path))
            {
                Directory.CreateDirectory(path);
            }

            // BUG FIX: the original append branch concatenated path + name + ".txt"
            // without a directory separator, writing to the wrong location.
            // Path.Combine handles the separator, and FileMode.Append creates the
            // file when it does not exist, so one code path replaces the original
            // duplicated create/append branches.
            string fullPath = Path.Combine(path, name + ".txt");
            using (FileStream fs = new FileStream(fullPath, FileMode.Append, FileAccess.Write))
            using (StreamWriter sw = new StreamWriter(fs))
            {
                sw.WriteLine(Log);
            }
        }

        /// <summary>
        /// Crawls one novel with the NSoup HTML parser: loads the index page, walks
        /// every chapter link in the table of contents, and saves each chapter's text
        /// via <see cref="Novel"/>.
        /// </summary>
        /// <param name="Url">URL of the novel's index (table-of-contents) page.</param>
        public void NSoup(string Url = "")
        {
            Document doc = NSoupClient.Connect(Url).Get();
            Elements titles = doc.GetElementsByTag("title");        // book title -> folder name
            string path = Server.MapPath("/Content/" + titles.Text);

            Elements cataLog = doc.GetElementsByClass("at");        // table-of-contents container
            Document docChild = NSoupClient.Parse(cataLog.ToString());
            Elements eleChild = docChild.GetElementsByTag("a");     // chapter links

            foreach (var item in eleChild)
            {
                string chapterTitle = item.Text();
                string htmlChildUrl = item.Attr("href").ToString().Trim();
                Document docTwo = NSoupClient.Connect(htmlChildUrl).Get();
                Element conTent = docTwo.GetElementById("contents");
                // BUG FIX: skip chapters whose page lacks a #contents element instead
                // of crashing with a NullReferenceException.
                if (conTent == null)
                {
                    continue;
                }
                Novel(conTent.Text(), KillBadChar(chapterTitle), path);
            }
        }

        /// <summary>
        /// Strips characters that are invalid in file names so a chapter title can be
        /// used safely as a .txt file name.
        /// </summary>
        /// <param name="charStr">Raw chapter title.</param>
        /// <returns>The title with illegal characters removed.</returns>
        public string KillBadChar(string charStr)
        {
            // BUG FIX: the original alternation contained @"|\; " which only matched
            // a semicolon followed by a space; the single character class above is
            // equivalent for the remaining characters and also strips a bare ';'.
            return BadFileNameChars.Replace(charStr, "");
        }
    }
}

【c# 使用HTML解析器NSoup爬取小说】

    推荐阅读