JS | Node.js 爬取国家统计局全国行政区数据
GitHub 地址:https://github.com/username-xu/node-districts
const fs = require('fs');
// 网络请求,文档可参考:https://www.jianshu.com/p/1432e0f29abd
const superagent = require('superagent');
// 设置编码格式,文档:https://www.npmjs.com/package/superagent-charset
require('superagent-charset')(superagent);
// DOM操作,语法类似jquery,文档可参考:https://www.jianshu.com/p/629a81b4e013
const cheerio = require('cheerio');
/**
 * Script entry point: starts the crawl at the province level.
 *
 * getSheng() is asynchronous, so a failure surfaces as a rejected promise.
 * It is reported here and turned into a non-zero exit code instead of being
 * left as an unhandled rejection.
 */
const main = () => {
  getSheng().catch((err) => {
    console.error(err);
    process.exitCode = 1;
  });
};

// Fetch a page
/**
 * Downloads one page of the 2018 division-code listing and returns its HTML,
 * decoded as gb2312.
 *
 * @param {string} url2 - Path relative to the listing root, e.g. 'index.html'.
 * @returns {Promise<string>} The page text, or '' when the request fails
 *   (the error is logged, not rethrown).
 */
const getPage = async (url2) => {
  // Pause before every request so the crawl is not rejected by the site's
  // firewall for requesting too fast. A timer-based pause is used so the event
  // loop (and the progress ticker) keeps running while we wait.
  await new Promise((resolve) => setTimeout(resolve, 1000));

  const url1 = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/';
  let response = '';
  try {
    const result = await superagent
      .get(url1 + url2)
      .set({
        // Mimic a regular browser request.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
        'Accept-Encoding': 'gzip, deflate',
        'content-type': 'text/html',
        'Content-Length': Buffer.byteLength(""),
        'Connection': 'keep-alive'
      })
      .buffer(true)
      // Decode the response body with the given charset.
      .charset('gb2312');
    response = result.text;
  } catch (err) {
    // Best effort: a failed page is logged and treated as empty.
    console.log(err);
  }
  return response;
};

// Fetch provinces
/**
 * Crawls the province index page, fills in every province's cities and
 * counties through getShi(), and hands the resulting list to output().
 *
 * @returns {Promise<void>} Settles once the data has been written.
 */
const getSheng = async () => {
  const time = Date.now();
  // Progress ticker: prints the elapsed seconds once per second.
  const timer = setInterval(() => {
    console.log('抓取中 ' + Math.floor((Date.now() - time) / 1000));
  }, 1000);

  try {
    const list = [];
    const page = await getPage('index.html');
    const $ = cheerio.load(page);
    const htmlList = $('.provincetr td a');
    for (let i = 0; i < htmlList.length; i++) {
      const link = $(htmlList[i]);
      const url = link.attr('href');
      if (!url) {
        // Without an href there is no code prefix to derive and no page to follow.
        continue;
      }
      const item = {
        name: link.text(),
        // The first two characters of the href are used as the province prefix.
        code: url.slice(0, 2) + '0000',
        children: []
      };
      // Awaited one at a time so page requests stay serialized.
      list.push(await getShi(url, item));
    }
    output(list);
  } finally {
    // Always stop the ticker, otherwise a failed crawl keeps the process alive.
    clearInterval(timer);
  }
};

// Fetch cities
/**
 * Loads a province page and appends one entry per '.citytr' row to
 * shengItem.children, each populated with its counties through getQu().
 *
 * @param {string} url - Path of the province page, relative to the listing root.
 * @param {{children: Array}} shengItem - Province entry to fill; mutated in place.
 * @returns {Promise<object>} The same shengItem, with children added.
 */
const getShi = async (url, shengItem) => {
  const page = await getPage(url);
  const $ = cheerio.load(page);
  const htmlList = $('.citytr');
  for (let i = 0; i < htmlList.length; i++) {
    const cells = $(htmlList[i]).find('td');
    // The first cell's link text supplies the code, the last cell's the name.
    const first = cells.first().find('a');
    const last = cells.last().find('a');
    const itemUrl = first.attr('href');
    const item = {
      name: last.text(),
      code: first.text().slice(0, 6),
      children: []
    };
    if (itemUrl) {
      shengItem.children.push(await getQu(itemUrl, item));
    } else {
      // No link to follow: keep the entry with empty children rather than
      // requesting a non-existent '.../undefined' page.
      shengItem.children.push(item);
    }
  }
  return shengItem;
};

// Fetch counties/districts
/**
 * Loads a city page and appends one entry per linked '.countytr' row to
 * shiItem.children.
 *
 * @param {string} itemUrl - Path of the city page, relative to the listing root.
 * @param {{children: Array}} shiItem - City entry to fill; mutated in place.
 * @returns {Promise<object>} The same shiItem, with children added.
 */
const getQu = async (itemUrl, shiItem) => {
  const page = await getPage(itemUrl);
  const $ = cheerio.load(page);
  const htmlList = $('.countytr');
  for (let i = 0; i < htmlList.length; i++) {
    const cells = $(htmlList[i]).find('td');
    const name = cells.last().find('a').text();
    const codeText = cells.first().find('a').text();
    // Rows whose cells carry no link text are skipped.
    if (name && codeText) {
      shiItem.children.push({
        name,
        code: codeText.slice(0, 6)
      });
    }
  }
  return shiItem;
};

/**
 * Synchronously blocks the calling thread until more than `d` milliseconds
 * have elapsed. Returns immediately when `d` is negative or not a number.
 *
 * @param {number} d - Minimum pause in milliseconds.
 * @returns {void}
 */
const sleep = d => {
  const deadline = Date.now() + d;
  // Atomics.wait parks the thread instead of spinning the CPU at 100%.
  const cell = new Int32Array(new SharedArrayBuffer(4));
  let remaining = deadline - Date.now();
  while (remaining >= 0) {
    // Nothing ever notifies the cell, so this always runs to its timeout.
    // The +1 guarantees progress when remaining is 0.
    Atomics.wait(cell, 0, 0, remaining + 1);
    remaining = deadline - Date.now();
  }
};

// Output
/**
 * Serializes the crawl result as JSON and writes it to 'data.json'
 * (resolved against the current working directory).
 *
 * @param {*} data - Any JSON-serializable value.
 * @throws {Error} Propagates whatever fs.writeFileSync or JSON.stringify throws.
 */
const output = data => {
  const dataStr = JSON.stringify(data);
  // writeFileSync is synchronous and takes no callback: a failure is thrown,
  // so it must be handled by the caller rather than by an error handler here.
  fs.writeFileSync('data.json', dataStr);
};

main();
【js|node.js爬取国家统计局全国行政区数据】
推荐阅读
- Node.js中readline模块实现终端输入
- 使用协程爬取网页,计算网页数据大小
- 国家宝藏观后有感
- 《别的国家都没有》与《帕特森》
- 以史之名,缅怀先烈;以烛之光,予之以暖。——淮北师范大学南京大屠杀死难者国家公祭日纪念活动
- Python实战计划学习笔记(9)为大规模爬取准备
- 国家重点支持的高新技术领域(一)
- 2020年国家公务员考试(“可以推出”是什么意思)
- gcb雷达钱包全球付,没有国家主权,只有全民主权!
- 爬取网易云音乐