c# 使用HTML解析器NSoup爬取小说
闲着没事,想试一试爬取一些小说。看了下园子里很多前辈写的一些文章,很受启发。
说下我的思路:查看文章网页链接 → 后台远程抓取到 Html 代码 → 分析所需数据结构 → 提取所需信息
在这其中则免不了对html的一些操作。
方法很多种,具体移步前辈文章:https://www.cnblogs.com/cang12138/p/7464226.html?utm_source=debugrun&utm_medium=referral
在这里我贴出我自己测试过的代码,以此记录一下
using NSoup;
using NSoup.Nodes;
using NSoup.Select;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Web.Mvc;
namespace PaChongDemo.Controllers
{
public class HomeController : Controller
{
    // Index pages of the novels to crawl (ddxsku.com chapter listings).
    private readonly string[] urlArray = new string[]
    {
        "https://www.ddxsku.com/files/article/html/23/23024/index.html",
        "https://www.ddxsku.com/files/article/html/2/2739/index.html"
    };

    /// <summary>
    /// Entry action: crawls every configured novel index page, then renders the default view.
    /// </summary>
    public ActionResult Index()
    {
        foreach (var item in urlArray)
        {
            NSoup(item);
        }
        return View();
    }

    /// <summary>
    /// Sends a form-encoded HTTP request and returns the UTF-8 decoded response body.
    /// NOTE(review): despite the name this issues a POST — confirm callers expect that.
    /// </summary>
    /// <param name="Url">Target URL.</param>
    /// <param name="postDataStr">Form body, encoded as gb2312 on the wire.</param>
    /// <returns>Response body decoded as UTF-8.</returns>
    public string HttpGet(string Url, string postDataStr)
    {
        CookieContainer cookie = new CookieContainer();
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
        request.Method = "POST";
        request.ContentType = "application/x-www-form-urlencoded";
        request.CookieContainer = cookie;

        // BUG FIX: the original set ContentLength from Encoding.UTF8.GetByteCount
        // but wrote the body with gb2312 — the declared length was wrong for any
        // non-ASCII payload. Encode once and use that byte count for both.
        byte[] postData = Encoding.GetEncoding("gb2312").GetBytes(postDataStr);
        request.ContentLength = postData.Length;
        using (Stream requestStream = request.GetRequestStream())
        {
            requestStream.Write(postData, 0, postData.Length);
        }

        // using-blocks replace the manual Close() calls so the streams are
        // released even when ReadToEnd throws.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            response.Cookies = cookie.GetCookies(response.ResponseUri);
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, Encoding.GetEncoding("utf-8")))
            {
                return reader.ReadToEnd();
            }
        }
    }

    /// <summary>
    /// Appends one chapter to "<paramref name="path"/>/<paramref name="name"/>.txt",
    /// creating the directory and file on first use.
    /// </summary>
    /// <param name="content">Chapter text to write.</param>
    /// <param name="name">File name (without extension); must already be sanitized.</param>
    /// <param name="path">Target directory.</param>
    public void Novel(string content, string name, string path)
    {
        string log = content + "\r\n";
        if (!Directory.Exists(path))
        {
            Directory.CreateDirectory(path);
        }

        // BUG FIX: the original append branch built the path as path + name + ".txt"
        // (no '/' separator), so appends went to a different file than the one
        // created by the first write. Build the path once and share it.
        string filePath = path + '/' + name + ".txt";
        FileMode mode = System.IO.File.Exists(filePath) ? FileMode.Append : FileMode.Create;
        using (FileStream fs = new FileStream(filePath, mode, FileAccess.Write))
        using (StreamWriter sw = new StreamWriter(fs))
        {
            sw.WriteLine(log);
        }
    }

    /// <summary>
    /// Crawls one novel index page with the NSoup HTML parser: reads the title and
    /// chapter list, then fetches each chapter page and writes it to disk via Novel().
    /// </summary>
    /// <param name="Url">URL of the novel's chapter index page.</param>
    public void NSoup(string Url = "")
    {
        Document doc = NSoupClient.Connect(Url).Get();
        // Page <title> doubles as the output folder name under /Content/.
        Elements titles = doc.GetElementsByTag("title");
        string path = Server.MapPath("/Content/" + titles.Text);
        // class="at" holds the chapter catalogue on ddxsku.com index pages.
        Elements cataLog = doc.GetElementsByClass("at");
        Document docChild = NSoupClient.Parse(cataLog.ToString());
        Elements eleChild = docChild.GetElementsByTag("a");
        // BUG FIX: in the original, this foreach header had been swallowed by a
        // trailing comment ("//查找a标签foreach ..."), leaving an orphan block
        // with an undefined 'item' — the chapter loop never compiled/ran.
        foreach (var item in eleChild)
        {
            string title = item.Text();                              // chapter heading
            string htmlChildUrl = item.Attr("href").ToString().Trim(); // chapter page URL
            Document docTwo = NSoupClient.Connect(htmlChildUrl).Get();
            // id="contents" wraps the chapter body text on ddxsku.com pages.
            Element conTent = docTwo.GetElementById("contents");
            string txtContent = conTent.Text();
            Novel(txtContent, KillBadChar(title), path);
        }
    }

    /// <summary>
    /// Strips characters that are illegal in Windows file names (: ; / \ | , * ? " &lt; &gt;)
    /// so chapter titles can be used as file names.
    /// </summary>
    /// <param name="charStr">Raw title text.</param>
    /// <returns>The title with all forbidden characters removed.</returns>
    public string KillBadChar(string charStr)
    {
        // BUG FIX: the original verbatim string was split across two source lines,
        // embedding a literal newline in the middle of the pattern.
        string reg = @"\:|\;|\/|\\|\||\,|\*|\?|\""|\<|\>";
        Regex r = new Regex(reg);
        return r.Replace(charStr, "");
    }
}
}
【c# 使用HTML解析器NSoup爬取小说】
推荐阅读
- 由浅入深理解AOP
- 【译】20个更有效地使用谷歌搜索的技巧
- mybatisplus如何在xml的连表查询中使用queryWrapper
- MybatisPlus|MybatisPlus LambdaQueryWrapper使用int默认值的坑及解决
- MybatisPlus使用queryWrapper如何实现复杂查询
- django-前后端交互
- iOS中的Block
- Linux下面如何查看tomcat已经使用多少线程
- 使用composer自动加载类文件
- android|android studio中ndk的使用