从网页抓取的div转化成html再转换为pdf

主要的操作步骤:抓取网页中的div,利用ftl模板生成临时html文件,再由html文件生成pdf文件,最后删除临时html文件。

package com.nriet.business.service.tflj.impl;

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import com.nriet.business.bean.tflj.TZywwBeCpwjxxBean;
import com.nriet.business.dao.tflj.TfljDao;
import com.nriet.business.service.tflj.TfljService;
import com.nriet.business.util.DateUtil;
import com.nriet.business.util.FkUtil;
import com.nriet.business.util.PdfUtil;
import com.nriet.business.util.ZqtpUtil;
import com.nriet.common.util.StringUtil;

/**
 * Scrapes one div from a web page, renders it into a temporary HTML file
 * through {@link FkUtil}, converts that file to PDF through {@link PdfUtil},
 * and finally removes the temporary HTML file.
 */
public class TfljServiceImpl {

    /** Address of the page to scrape. */
    private static final String URL = "http://www.nm110.cn/psh/counry/waring/tyhoon.html";

    /** Root directory under which the PDFs are stored (year\yearmonth\file). */
    private static final String PATH = "E:\\data\\share\\product\\DATA\\product\\";

    /** Browser-like User-Agent sent with the request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36";

    /** Connection timeout in milliseconds. */
    private static final int TIMEOUT_MILLIS = 100000;

    /** Number of leading {@code <b>} tags expected to hold year, month and day. */
    private static final int DATE_PARTS = 3;

    /** At most this many leading {@code <b>} tags are appended to the file name. */
    private static final int MAX_NAME_PARTS = 4;

    /**
     * Fetches {@link #URL}, locates the {@code #text} element inside
     * {@code #maincontent} and, unless the target PDF already exists, writes it
     * out as a PDF below {@link #PATH}.
     *
     * <p>The method returns without producing a file when the expected
     * elements are missing from the page or the target directory cannot be
     * created. I/O failures while fetching the page are printed, not rethrown.
     */
    public void receiveURL() {
        Connection conn = Jsoup.connect(URL);
        conn.header("User-Agent", USER_AGENT);
        try {
            // The returned Document wraps the HTML of the fetched page.
            Document doc = conn.timeout(TIMEOUT_MILLIS).get();

            Element mainContent = doc.getElementById("maincontent");
            // The div that is going to be rendered.
            Element text = mainContent == null ? null : mainContent.getElementById("text");
            if (text == null) {
                System.err.println("Element #maincontent/#text not found at " + URL);
                return;
            }

            // The leading <b> tags carry the date: year, month, day.
            Elements dateTag = text.getElementsByTag("b");
            if (dateTag.size() < DATE_PARTS) {
                System.err.println("Expected at least " + DATE_PARTS
                        + " <b> date tags but found " + dateTag.size());
                return;
            }
            String year = dateTag.get(0).text();
            String month = dateTag.get(1).text();

            // File name: fixed prefix followed by the text of the first <b> tags.
            StringBuilder nameBuilder = new StringBuilder("66666");
            int parts = Math.min(dateTag.size(), MAX_NAME_PARTS);
            for (int i = 0; i < parts; i++) {
                nameBuilder.append(dateTag.get(i).text());
            }
            String fileName = nameBuilder.toString();

            // NOTE(review): year, month and fileName come straight from the
            // scraped page and end up in a file system path; consider
            // validating them (e.g. digits only) before use.
            String dir = PATH + year + "\\" + year + month + "\\";
            String htmlPath = dir + fileName + ".html"; // temporary HTML file
            String pdfPath = dir + fileName + ".pdf";

            File pdfFile = new File(pdfPath);
            if (pdfFile.exists()) {
                // Already generated earlier; nothing to do.
                return;
            }

            // The directory has to exist BEFORE the temporary HTML file is
            // written into it, otherwise the first run for a new month fails.
            File parent = pdfFile.getParentFile();
            if (parent != null && !parent.isDirectory() && !parent.mkdirs()) {
                System.err.println("Cannot create directory " + parent);
                return;
            }

            Map<String, Object> model = new HashMap<String, Object>();
            model.put("tfgb", text);

            File htmlFile = new File(htmlPath);
            try {
                // Render the temporary HTML file, then convert it to PDF.
                new FkUtil().html(model, htmlPath);
                new PdfUtil().createPdf(htmlPath, pdfPath);
            } finally {
                // Remove the temporary HTML file even if the conversion failed.
                if (htmlFile.isFile() && !htmlFile.delete()) {
                    System.err.println("Cannot delete temporary file " + htmlFile);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

【从网页抓取的div转化成html再转换为pdf】这个是模板
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<style type="text/css">
#text { padding: 10px; min-height: 400px; font-family: simsun; }
#text div{ margin: 0; word-break: break-all; word-wrap: break-word; }
.title { color: red; font-size: 34px; font-weight: bold; text-align: center; height: 40px; line-height: 40px; }
.author { font-size: 14px; padding: 10px 0; text-align: center; }
.writing{ padding: 0; font-size: 14px; }
p{ margin:2px auto; font-size: 16px; text-indent: 2em; }
.subhead { font-size: 18px; font-weight: bold; text-align: center; padding-bottom: 10px; }
</style>
</head>
<body>
${tfgb}
</body>
</html>

根据抓取到的信息,利用ftl模板生成html文件
package com.nriet.business.util;

import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Locale;
import java.util.Map;

import javax.servlet.ServletContext;
import javax.servlet.http.HttpServletRequest;

import org.springframework.web.context.ContextLoader;

import com.nriet.business.service.tflj.impl.TfljServiceImpl;

import freemarker.cache.MruCacheStorage;
import freemarker.template.Configuration;
import freemarker.template.Template;
import freemarker.template.TemplateException;

/**
 * Renders a FreeMarker template into an HTML file.
 */
public class FkUtil {

    /** Template directory, resolved relative to this class. */
    private static final String TEMPLATE_PATH = "templates";

    /** Name of the HTML template file. */
    private static final String TEMPLATE_NAME = "ftl.ftl";

    /**
     * Generates an HTML file from the template.
     *
     * <p>Failures while loading the template or writing the file are printed
     * and not rethrown.
     *
     * @param rootMap data model handed to the template
     * @param name    path and name of the HTML file to write
     */
    public void html(Map<?, ?> rootMap, String name) {
        Configuration cfg = new Configuration();
        try {
            cfg.setDefaultEncoding("UTF-8");
            cfg.setEncoding(Locale.getDefault(), "UTF-8");
            // Templates are looked up relative to the package of this class.
            cfg.setClassForTemplateLoading(this.getClass(), TEMPLATE_PATH);
            Template template = cfg.getTemplate(TEMPLATE_NAME);

            // try-with-resources closes the writer even when processing the
            // template fails, so the file handle is never leaked.
            try (Writer writer = new OutputStreamWriter(new FileOutputStream(new File(name)), "UTF-8")) {
                // The map supplies the template data; the rendered output is
                // written to the HTML file.
                template.process(rootMap, writer);
                writer.flush();
            }
        } catch (IOException | TemplateException e1) {
            e1.printStackTrace();
        }
    }
}

html文件生成好了,我们就能生成pdf了
package com.nriet.business.util;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.charset.Charset;

import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.tool.xml.XMLWorkerHelper;

/**
 * Converts an XHTML file into a PDF file with iText and XMLWorker.
 */
public class PdfUtil {

    /**
     * Creates a PDF from an XHTML file.
     *
     * <p>Checked failures (missing file, I/O error, iText document error) are
     * printed and not rethrown.
     *
     * @param html path of the XHTML file to read (parsed as UTF-8)
     * @param pdf  path of the PDF file to write
     */
    public void createPdf(String html, String pdf) {
        // The input is opened first so that a missing HTML file does not leave
        // an empty PDF behind. Both streams are closed on every path, which
        // also lets the caller delete the HTML file afterwards.
        try (FileInputStream htmlIn = new FileInputStream(html);
                FileOutputStream pdfOut = new FileOutputStream(pdf)) {
            // step 1
            Document document = new Document();
            try {
                // step 2
                PdfWriter writer = PdfWriter.getInstance(document, pdfOut);
                // step 3
                document.open();
                // step 4
                XMLWorkerHelper.getInstance().parseXHtml(writer, document, htmlIn, Charset.forName("UTF-8"));
            } catch (DocumentException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            }
            // step 5: close the document while the output stream is still
            // open so that iText can finish writing the PDF.
            document.close();
        } catch (IOException e) {
            // Covers FileNotFoundException from opening either stream as well
            // as failures while closing them.
            e.printStackTrace();
        }
    }
}

jar包及jdk
从网页抓取的div转化成html再转换为pdf
文章图片

    推荐阅读