Campus Assistant App -- Crawling the Academic Affairs Office Pages and Parsing the Data

Before I had a server, I crawled the page data directly, and even in the final version the notice content is still obtained by crawling the Academic Affairs Office pages directly. This approach still comes in handy at times, so here is a walkthrough.



import android.util.Log;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.CookieStore;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.params.ClientPNames;
import org.apache.http.client.params.CookiePolicy;
import org.apache.http.client.params.HttpClientParams;
import org.apache.http.conn.ClientConnectionManager;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.HttpConnectionParams;
import org.apache.http.params.HttpParams;
import org.apache.http.params.HttpProtocolParams;
import org.apache.http.protocol.HTTP;
import org.apache.http.util.EntityUtils;

public class InternetHelper {

    private static final String TAG = "InternetHelper";
    public static final String URL_BASE = "http://61.183.207.40/zjdxgc/(kgd5dczwtsnv50yznsqeuh55)/";
    public static final String USER_AGENT = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)";
    public static final String HOST = "http://61.183.207.40";

    private static CookieStore cookie = null;
    private static DefaultHttpClient client = null;

    /**
     * Singleton-style factory: build the HttpClient once and wrap up the request parameters.
     * @return the shared HttpClient instance
     */
    public static DefaultHttpClient getClient() {
        if (null == client) {
            HttpParams httpParams = new BasicHttpParams();
            httpParams.setParameter("http.protocol.allow-circular-redirects", Boolean.valueOf(true));
            httpParams.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH);
            HttpClientParams.setRedirecting(httpParams, true);
            // Set the encoding
            HttpProtocolParams.setContentCharset(httpParams, HTTP.UTF_8);
            HttpProtocolParams.setUseExpectContinue(httpParams, true);
            HttpProtocolParams.setUserAgent(httpParams, USER_AGENT);
            HttpConnectionParams.setTcpNoDelay(httpParams, true);
            // Disable the stale-connection check to speed things up
            HttpConnectionParams.setStaleCheckingEnabled(httpParams, false);
            //// Timeout for taking a connection from the pool
            //ConnManagerParams.setTimeout(httpParams, 1000);
            //// Connection timeout
            //HttpConnectionParams.setConnectionTimeout(httpParams, 2000);
            //// Request (socket) timeout
            //HttpConnectionParams.setSoTimeout(httpParams, 4000);
            // Let the client speak both HTTP and HTTPS
            SchemeRegistry schReg = new SchemeRegistry();
            schReg.register(new Scheme("http", PlainSocketFactory.getSocketFactory(), 80));
            schReg.register(new Scheme("https", SSLSocketFactory.getSocketFactory(), 443));
            // Use a thread-safe connection manager
            ClientConnectionManager conMgr = new ThreadSafeClientConnManager(httpParams, schReg);
            client = new DefaultHttpClient(conMgr, httpParams);
        }
        if (null != cookie) {
            client.setCookieStore(cookie);
            Log.i(TAG, cookie.toString());
        }
        return client;
    }

    /**
     * Fetch the page source that contains the news, given its URL.
     * @param url_news URL of the news page
     * @return the page source, or null on a 400 response
     */
    public static String getNewsHtmlByURL(String url_news) throws Exception {
        DefaultHttpClient client = InternetHelper.getClient();
        HttpGet localHttpGet = new HttpGet(url_news);
        String referer = url_news;
        localHttpGet.setHeader("Referer", referer);
        localHttpGet.setHeader("User-Agent", USER_AGENT);
        HttpResponse httpResponse = client.execute(localHttpGet);
        int statusCode = httpResponse.getStatusLine().getStatusCode();
        System.out.println("statusCode --> " + statusCode);
        String html = null;
        if (statusCode == 400) {
            HttpEntity localHttpEntity = httpResponse.getEntity();
            System.out.println("Got a 400; the html returned was: " + EntityUtils.toString(localHttpEntity, "gb2312"));
            return null;
        } else if (statusCode == 302) {
            // Redirect: read the target address from the Location header
            Header[] arrayOfHeader = httpResponse.getHeaders("Location");
            String location = HOST + arrayOfHeader[0].getValue();
            HttpGet httpGet = new HttpGet(location);
            httpGet.setHeader("Referer", location);
            html = EntityUtils.toString(client.execute(httpGet).getEntity(), "gb2312");
            httpGet.abort();
        } else if (statusCode == 200) {
            html = EntityUtils.toString(httpResponse.getEntity(), "gb2312");
        }
        return html;
    }
}
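For context, here is a minimal sketch of how this helper might be driven. The worker thread and the notice-list path below are illustrative assumptions, not part of the original project:

// Sketch: fetch the notice-list page off the Android main thread.
// "noticeList.aspx" is a placeholder path, not the real one.
new Thread(new Runnable() {
    @Override
    public void run() {
        try {
            String listUrl = InternetHelper.URL_BASE + "noticeList.aspx";
            String html = InternetHelper.getNewsHtmlByURL(listUrl);
            // hand the source over to the parsing code shown below
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}).start();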


Accessing the pages directly requires adding some request headers. Exactly which headers are needed has to be worked out by capturing the traffic; I have changed computers a few times since then and have forgotten which Chrome extension I used for that. What comes back is the full page source, which then has to be parsed to pull out the information you want. The first target is the Academic Affairs Office notice list, which contains each notice's title and its link.
/**
 * Parse the notice list from the page source that contains the news titles.
 * The regex and the tail of this method were mangled when the post was published;
 * the reconstruction below is a best guess at the original.
 */
private static List<News> getNewsByHtml(String html) {
    List<News> newsList = new ArrayList<News>();
    // Match the anchors whose href contains "&todo=show", capturing link and title
    String reg = "(<a href=\"[^>]*&todo=show\" target=\"_blank\">).*?(</a>)";
    Pattern pattern = Pattern.compile(reg);
    Matcher matcher = pattern.matcher(html);
    while (matcher.find()) {
        String titleStr = matcher.group(0);
        // The visible title sits between the 】 of the category prefix and </a>
        String title = titleStr.substring(titleStr.indexOf('】') + 1, titleStr.indexOf("</a>"));
        News news = new News();
        news.setTitle(title.trim());
        // Assumes the href is relative to URL_BASE
        news.setUrl(InternetHelper.URL_BASE + titleStr.substring(titleStr.indexOf('"') + 1, titleStr.indexOf("\" target")));
        newsList.add(news);
    }
    return newsList;
}
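The News entity itself never appears in the post; judging from the getters, setters and column constants used above and below, it is roughly a plain data holder like this (a sketch -- the actual string values of the table and column constants are guesses):

// Sketch of the News entity inferred from the accessors used in this post.
public class News {
    public static final String TABLE_NAME = "news";     // assumed value
    public static final String ID = "_id";              // assumed value
    public static final String TITLE = "title";         // assumed value
    public static final String URL = "url";             // assumed value
    public static final String SOURCE = "source";       // assumed value
    public static final String TIME = "time";           // assumed value
    public static final String CONTENT = "content";     // assumed value

    private long id;
    private String title;
    private String url;
    private String source;
    private String time;
    private String content;

    public long getId() { return id; }
    public void setId(long id) { this.id = id; }
    public String getTitle() { return title; }
    public void setTitle(String title) { this.title = title; }
    public String getUrl() { return url; }
    public void setUrl(String url) { this.url = url; }
    public String getSource() { return source; }
    public void setSource(String source) { this.source = source; }
    public String getTime() { return time; }
    public void setTime(String time) { this.time = time; }
    public String getContent() { return content; }
    public void setContent(String content) { this.content = content; }
}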


By repeatedly picking apart the source, the titles and links are extracted with regular expressions and wrapped into a list for display; tapping a title in the list jumps to that notice's detail page. The detail page's source can be fetched through the link, but it contains far too many tags, so to find where the body text starts the notice's title is used as an anchor:

/**
 * Look up the News entity for the title that was tapped.
 * @param title the notice title
 */
public News getNewsByTitle(String title) throws Exception {
    SQLiteDatabase database = dbHelper.getWritableDatabase();
    Cursor cursor = database.query(News.TABLE_NAME,
            new String[]{News.ID, News.URL, News.SOURCE, News.TIME, News.CONTENT},
            News.TITLE + "=?", new String[]{title}, null, null, null);
    News news = new News();
    if (cursor.moveToFirst()) {
        news.setId(cursor.getLong(cursor.getColumnIndex(News.ID)));
        news.setTime(cursor.getString(cursor.getColumnIndex(News.TIME)));
        news.setSource(cursor.getString(cursor.getColumnIndex(News.SOURCE)));
        news.setContent(cursor.getString(cursor.getColumnIndex(News.CONTENT)));
        news.setUrl(cursor.getString(cursor.getColumnIndex(News.URL)));
        news.setTitle(title);
    }
    cursor.close();
    // If the cached row has no body yet, fetch it from the network
    if (null == news.getContent()) {
        String url = news.getUrl();
        String html = InternetHelper.getNewsHtmlByURL(url);
        html = html.substring(html.indexOf(title) + title.length());   // drop everything before the title
        String regexstr = "<(?!p|/p).*?>";
        html = html.replaceAll(regexstr, "");                          // strip the HTML tags
        html = html.replace("&nbsp;", " ");                            // turn non-breaking spaces into plain spaces
        // Body text: between the "教务处" header and the "机构设置" footer link
        String content = html.substring(html.indexOf("教务处") + 3, html.lastIndexOf("机构设置")).trim();
        // Publish time: between "发布时间:" and "点击次数"
        String time = html.substring(html.indexOf("发布时间:") + 5, html.indexOf("点击次数"));
        news.setTime(time);
        news.setContent(content);
        // Update the row in the database, mainly to add content and time
        updateNews(news);
    }
    database.close();
    return news;
}
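updateNews(news) is called above but not listed in the post; a minimal version, assuming the same dbHelper and the News columns sketched earlier, could look like this:

// Sketch of updateNews -- assumes the dbHelper and News column constants used above.
private void updateNews(News news) {
    SQLiteDatabase database = dbHelper.getWritableDatabase();
    ContentValues values = new ContentValues();
    values.put(News.TIME, news.getTime());
    values.put(News.CONTENT, news.getContent());
    // Write the freshly parsed time and body back to the cached row;
    // the caller (getNewsByTitle) closes the database afterwards.
    database.update(News.TABLE_NAME, values,
            News.ID + "=?", new String[]{String.valueOf(news.getId())});
}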


With the head and tail of the page source cut away, the content section and the few remaining tags are easy to see, and a second regular expression pulls out the body text. Since I save the notices to a database once they are fetched, initialization loads from the database first; I just never got around to implementing list refresh.
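The initial load from the database is not shown either; under the same assumptions it is just a query over the cached rows, something like:

// Sketch: read the cached notices back out of SQLite for the initial list.
private List<News> getCachedNews() {
    List<News> newsList = new ArrayList<News>();
    SQLiteDatabase database = dbHelper.getReadableDatabase();
    Cursor cursor = database.query(News.TABLE_NAME,
            new String[]{News.ID, News.TITLE, News.URL},
            null, null, null, null, News.ID + " DESC");
    while (cursor.moveToNext()) {
        News news = new News();
        news.setId(cursor.getLong(cursor.getColumnIndex(News.ID)));
        news.setTitle(cursor.getString(cursor.getColumnIndex(News.TITLE)));
        news.setUrl(cursor.getString(cursor.getColumnIndex(News.URL)));
        newsList.add(news);
    }
    cursor.close();
    database.close();
    return newsList;
}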


One thing worth calling out here:
String regexstr = "<(?!p|/p).*?>";
html = html.replaceAll(regexstr, "");   // strip the HTML tags
html = html.replace("&nbsp;", " ");     // turn non-breaking spaces into plain spaces

It really is handy!
It cleanly separates out the body text.
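To see why it works: the negative lookahead (?!p|/p) skips any tag whose name begins, right after the "<", with "p" or "/p", so <p> and </p> survive while every other tag is removed. A small self-contained check (the sample HTML is made up for illustration):

public class TagStripDemo {
    public static void main(String[] args) {
        String html = "<div class=\"c\"><p>第一段&nbsp;正文<span>内容</span></p><p>第二段</p></div>";
        // Drop every tag that does not start with "p" or "/p", then normalise &nbsp;
        String text = html.replaceAll("<(?!p|/p).*?>", "").replace("&nbsp;", " ");
        System.out.println(text);   // prints: <p>第一段 正文内容</p><p>第二段</p>
    }
}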

