import java.io.File; import java.io.FileInputStream; import java.io.InputStream; import java.io.IOException; import org.apache.pdfbox.cos.COSDocument; import org.apache.pdfbox.pdfparser.PDFParser; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.util.PDFTextStripper; /** * PDFTextParser -- processes the PDFs into text with apache pdf box * * @author Alex Mousavi and Kyle Jorgensen * @version Choice Points Project1 for CS56, Spring 2011 * @see PDFTextParserTest */ //class PDFTextParser adapted from http://thottingal.in/blog/2009/06/24/pdfbox-extract-text-from-pdf/ public class PDFTextParser { // Extract text from PDF Document static String pdftoText(String fileName) { PDFParser parser; String parsedText = null;; PDFTextStripper pdfStripper = null; PDDocument pdDoc = null; COSDocument cosDoc = null; File file = new File(fileName); if (!file.isFile()) { System.err.println("File " + fileName + " does not exist."); return null; } try { parser = new PDFParser(new FileInputStream(file)); } catch (IOException e) { System.err.println("Unable to open PDF Parser. " + e.getMessage()); return null; } try { parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdfStripper.setStartPage(1); pdfStripper.setEndPage(5); parsedText = pdfStripper.getText(pdDoc); } catch (Exception e) { System.err .println("An exception occured in parsing the PDF Document." + e.getMessage()); } finally { try { if (cosDoc != null) cosDoc.close(); if (pdDoc != null) pdDoc.close(); } catch (Exception e) { e.printStackTrace(); } } return parsedText; } // Extract text from PDF Document static String pdftoText(InputStream is) { PDFParser parser; String parsedText = null;; PDFTextStripper pdfStripper = null; PDDocument pdDoc = null; COSDocument cosDoc = null; try { parser = new PDFParser(is); } catch (IOException e) { System.err.println("Unable to open PDF Parser. " + e.getMessage()); return null; } try { parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdfStripper.setStartPage(1); pdfStripper.setEndPage(5); parsedText = pdfStripper.getText(pdDoc); } catch (Exception e) { System.err .println("An exception occured in parsing the PDF Document." + e.getMessage()); } finally { try { if (cosDoc != null) cosDoc.close(); if (pdDoc != null) pdDoc.close(); } catch (Exception e) { e.printStackTrace(); } } return parsedText; } }