86 lines
2.8 KiB
Java
86 lines
2.8 KiB
Java
package com.ag.demo;
|
|
|
|
|
|
import cn.hutool.core.collection.CollUtil;
|
|
import cn.hutool.core.convert.Convert;
|
|
import com.ag.util.StrUtil;
|
|
import com.alibaba.fastjson.JSON;
|
|
import org.apache.pdfbox.multipdf.Splitter;
|
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
|
import org.apache.pdfbox.text.PDFTextStripper;
|
|
import org.junit.Test;
|
|
|
|
import java.io.File;
|
|
import java.io.IOException;
|
|
import java.util.LinkedHashMap;
|
|
import java.util.List;
|
|
import java.util.Map;
|
|
|
|
public class PdfSplitService {
|
|
|
|
String sourcePath = "F:\\pdfsplit\\MDB1R981_678692.pdf";
|
|
String targetPath = "F:\\pdfsplit\\";
|
|
|
|
@Test
|
|
public void getPdfTextByPages() throws IOException {
|
|
PDDocument document = PDDocument.load(new File(sourcePath));
|
|
document.setAllSecurityToBeRemoved(true);
|
|
PDFTextStripper stripper = new PDFTextStripper();
|
|
stripper.setSortByPosition(true);
|
|
List<List<String>> txts = CollUtil.list(false);
|
|
for(int i = 1; i <= document.getNumberOfPages() ; i++) {
|
|
stripper.setStartPage(i);
|
|
stripper.setEndPage(i);
|
|
String text = stripper.getText(document);
|
|
if(!StrUtil.isEmpty(text)){
|
|
List<String> rows = CollUtil.list(false);
|
|
String[] arrs = text.split("\\r\\n");
|
|
for (String row : arrs){
|
|
if (!StrUtil.isEmpty(row.trim())){
|
|
rows.add(row);
|
|
}
|
|
}
|
|
txts.add(rows);
|
|
}
|
|
}
|
|
|
|
Map<String, String> map = new LinkedHashMap<>();
|
|
String billNo = "";
|
|
for (int i=0; i<txts.size(); i++){
|
|
List<String> txt = txts.get(i);
|
|
boolean hasNumber = false;
|
|
for (String row : txt){
|
|
if(StrUtil.isNumeric(row.trim())){
|
|
System.out.println(row);
|
|
hasNumber = true;
|
|
billNo = row.trim();
|
|
break;
|
|
}
|
|
}
|
|
if (hasNumber){
|
|
map.put(billNo, i+"_"+i);
|
|
}else {
|
|
map.put(billNo, (i-1)+"_"+i);
|
|
}
|
|
//System.out.println("page:~~~~~~~~~~~~~~~~"+(i+1));
|
|
}
|
|
System.out.println(JSON.toJSONString(map));
|
|
Splitter splitter = new Splitter();
|
|
for (Map.Entry<String, String> entry : map.entrySet()) {
|
|
String val = entry.getValue();
|
|
int start = Convert.toInt(val.split("_")[0])+1;
|
|
int end = Convert.toInt(val.split("_")[1])+1;
|
|
splitter.setStartPage(start);
|
|
splitter.setEndPage(end);
|
|
splitter.setSplitAtPage(end-start+1);
|
|
List<PDDocument> pages = splitter.split(document);
|
|
for (PDDocument pd : pages) {
|
|
String pdfName = targetPath + entry.getKey() + ".pdf";
|
|
pd.save(pdfName);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
}
|