這是之前開發文件搜尋功能時所使用的 PDF 擷取器，功能是將 PDF 的內容抽出並存成文字檔，在此發布出來給大家參考。
首先要下載所需套件pdfBox: http://www.pdfbox.org/, 下載後將src/PDFBox-0.7.3.jar與external/FontBox-0.1.0-dev.jar置放到classpath中。
import java.io.File; import java.io.FileOutputStream; import java.io.OutputStreamWriter; import java.io.Writer; import java.net.MalformedURLException; import java.net.URL; import org.pdfbox.pdmodel.PDDocument; import org.pdfbox.util.PDFTextStripper; public class PDFParser { public void readPdf(String file) throws Exception { boolean sort = false; // is sort. String pdfFile = file; // pdf file name. String textFile = null; // output file. String encoding = "UTF-8"; // encode type. int startPage = 1; // parse start page no. int endPage = Integer.MAX_VALUE; // parse end page no. Writer output = null; // output writer. PDDocument document = null; // PDF Document. try { URL url = new URL(pdfFile); // create connection from source file. document = PDDocument.load(pdfFile); // use PDDocument to load file. String fileName = url.getFile(); // get pdf's name. if (fileName.length() > 4) { // name output file. File outputFile = new File(fileName.substring(0, fileName.length() - 4) + ".txt"); textFile = outputFile.getName(); } // create connection to output file. output = new OutputStreamWriter(new FileOutputStream(textFile), encoding); // use PDFTextStripper to get content. PDFTextStripper stripper = new PDFTextStripper(); stripper.setSortByPosition(sort); // set sort. stripper.setStartPage(startPage); // set start page. stripper.setEndPage(endPage); // set end page. // write to output file. stripper.writeText(document, output); } catch (MalformedURLException e) { e.printStackTrace(); } finally { // close all stream. if (output != null) { output.close(); } if (document != null) { document.close(); } } } public static void main(String[] args) { PDFParser pdfReader = new PDFParser(); try { // read pdf content. pdfReader.readPdf("place your pdf location here"); } catch (Exception e) { e.printStackTrace(); } } }
沒有留言:
張貼留言