2008年11月26日 星期三

Java PDFParser


之前寫文件搜尋時候使用的PDF擷取器, 功能為將PDF內容抽出存成文字檔, 發布出來給大家參考。

首先要下載所需套件 PDFBox: http://www.pdfbox.org/ , 下載後將 src/PDFBox-0.7.3.jar 與 external/FontBox-0.1.0-dev.jar 這兩個檔案置放到 classpath 中。

import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;

/**
 * Extracts the text content of a PDF document into a plain-text file using
 * the PDFBox {@code PDDocument} and {@code PDFTextStripper} classes.
 */
public class PDFParser
{
    /** Extension stripped from the source name when naming the output file. */
    private static final String PDF_SUFFIX = ".pdf";

    /** Extension given to the generated text file. */
    private static final String TEXT_SUFFIX = ".txt";

    /**
     * Reads a PDF and writes all of its text, UTF-8 encoded, to a text file.
     *
     * <p>The output file is created in the current working directory and is
     * named after the source document with its ".pdf" extension replaced by
     * ".txt" (for example {@code http://host/docs/report.pdf} and
     * {@code /tmp/report.pdf} both produce {@code report.txt}).
     *
     * @param file location of the PDF: either a URL (e.g. {@code file:} or
     *             {@code http:}) or a plain local file path.
     * @throws IllegalArgumentException if {@code file} is {@code null} or no
     *         output file name can be derived from it.
     * @throws Exception if the document cannot be loaded or the text cannot
     *         be written.
     */
    public void readPdf(String file) throws Exception
    {
        if (file == null) {
            throw new IllegalArgumentException("file must not be null");
        }

        boolean sort = false;             // whether to sort text by position.
        String encoding = "UTF-8";        // encoding of the output file.
        int startPage = 1;                // first page to extract.
        int endPage = Integer.MAX_VALUE;  // last page to extract (all pages).
        Writer output = null;             // output writer.
        PDDocument document = null;       // PDF document.
        try {
            String sourceName;            // name the output file is derived from.
            try {
                URL url = new URL(file);
                // Load through the URL so that non-file locations work too.
                document = PDDocument.load(url);
                // getPath() excludes the query string, which must not leak
                // into the output file name.
                sourceName = url.getPath();
            } catch (MalformedURLException e) {
                // Not a URL: treat the argument as a local file path instead
                // of giving up silently.
                document = PDDocument.load(file);
                sourceName = file;
            }

            // create connection to output file.
            String textFile = toTextFileName(sourceName);
            output = new OutputStreamWriter(new FileOutputStream(textFile), encoding);

            // use PDFTextStripper to get content.
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setSortByPosition(sort);
            stripper.setStartPage(startPage);
            stripper.setEndPage(endPage);

            // write to output file.
            stripper.writeText(document, output);
        } finally {
            // Nested finally: the document is closed even when closing the
            // writer throws.
            try {
                if (output != null) {
                    output.close();
                }
            } finally {
                if (document != null) {
                    document.close();
                }
            }
        }
    }

    /**
     * Derives the output file name from the source location: drops any
     * directory part, removes a trailing ".pdf" (case-insensitive) and
     * appends ".txt".
     *
     * @param sourceName path or URL path of the source document.
     * @return the bare output file name (no directory component).
     * @throws IllegalArgumentException if nothing is left to name the file after.
     */
    private static String toTextFileName(String sourceName)
    {
        String baseName = new File(sourceName).getName();
        int stemLength = baseName.length() - PDF_SUFFIX.length();
        if (stemLength >= 0
                && baseName.regionMatches(true, stemLength, PDF_SUFFIX, 0, PDF_SUFFIX.length())) {
            baseName = baseName.substring(0, stemLength);
        }
        if (baseName.length() == 0) {
            throw new IllegalArgumentException(
                    "Cannot derive an output file name from: " + sourceName);
        }
        return baseName + TEXT_SUFFIX;
    }

    /**
     * Command-line entry point.
     *
     * @param args optional; {@code args[0]} is the PDF location. When absent,
     *             the placeholder below is used and must be edited by hand.
     */
    public static void main(String[] args)
    {
        PDFParser pdfReader = new PDFParser();
        String location = (args != null && args.length > 0)
                ? args[0]
                : "place your pdf location here";
        try {
            // read pdf content.
            pdfReader.readPdf(location);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

沒有留言:

張貼留言