At the same time, I was exploring PDF document parsing with the Apache PDFBox framework.
Back then, I wrote some sample code to see what PDFBox is capable of.
Exactly a year ago, my niece asked for some help with parsing a PDF file.
I adapted the Java file to her needs and am keeping it here for later use.
import java.io.*;
import java.time.Duration;
import java.time.Instant;
import java.util.regex.Pattern;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
/**
 * Extracts the text of a PDF with PDFBox and rewrites each data line as a CSV row.
 *
 * <p>Every extracted line is trimmed and then classified:
 * <ul>
 *   <li>lines starting with an upper-case ASCII letter are reported and skipped;</li>
 *   <li>lines without any space are reported and skipped;</li>
 *   <li>every other line is split on spaces and written as
 *       {@code htno,subcode,subname,internal,ext,credit}.</li>
 * </ul>
 *
 * <p>Usage: {@code java PdfToCsvGenerator [input.pdf [output.csv]]}. Both arguments are
 * optional; when omitted the built-in default paths are used.
 */
public class PdfToCsvGenerator {

    /** Default input PDF. TODO: adjust to the local environment. */
    private static final String DEFAULT_INPUT_PDF = "C:\\xxx\\files\\final_result.pdf";

    /** Default output CSV. TODO: adjust to the local environment. */
    private static final String DEFAULT_OUTPUT_CSV = "C:\\xxx\\files\\out.csv";

    /** First page handed to the text stripper (PDFBox page numbers are 1-based). */
    private static final int FIRST_PAGE = 1;

    /**
     * Last page handed to the text stripper (inclusive).
     * TODO: only the first pages are parsed for now; extend to the whole document once ready.
     */
    private static final int LAST_PAGE = 3;

    /** A trimmed line that starts with an upper-case letter is not a data row. */
    private static final Pattern NON_DATA_LINE = Pattern.compile("^[A-Z].*$");

    /** A token made only of letters, '-', '&' or '/' is treated as part of the subject name. */
    private static final Pattern SUBJECT_WORD = Pattern.compile("[A-Za-z-&/]+");

    /** Field delimiter: one or more spaces, so repeated spaces do not create empty fields. */
    private static final Pattern SPACES = Pattern.compile(" +");

    /**
     * Converts the PDF to CSV and prints progress and elapsed time to standard output.
     *
     * @param args optional: {@code args[0]} input PDF path, {@code args[1]} output CSV path
     */
    public static void main(String[] args) {
        String inputPdf = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PDF;
        String outputCsv = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT_CSV;
        try {
            System.out.println("Processing PDF file. please wait.");
            Instant start = Instant.now();

            String[] lines = extractLines(inputPdf);

            // try-with-resources flushes and closes the writer even if a write fails.
            try (BufferedWriter out = new BufferedWriter(new FileWriter(outputCsv))) {
                for (String aLine : lines) {
                    String trimLine = aLine.trim();
                    if (NON_DATA_LINE.matcher(trimLine).matches()) {
                        System.out.println("Ignoring input line:" + trimLine);
                    } else if (trimLine.contains(" ")) {
                        out.write(toCsvRow(trimLine));
                    } else {
                        System.out.println("Ignoring 2ed stage input line:" + trimLine);
                    }
                }
            }

            System.out.println("processing is complete. Check output file.");
            Duration timeElapsed = Duration.between(start, Instant.now());
            System.out.println("Time taken: " + timeElapsed.getSeconds() + " seconds");
        } catch (IOException e) {
            // Covers FileNotFoundException as well; report which files were involved.
            System.err.println("Failed to convert '" + inputPdf + "' to '" + outputCsv + "'");
            e.printStackTrace();
        }
    }

    /**
     * Reads pages {@link #FIRST_PAGE}..{@link #LAST_PAGE} of the PDF and returns the text
     * split into lines using the stripper's own line separator.
     *
     * @param pdfPath path of the PDF file to read
     * @return the extracted text, one element per line
     * @throws IOException if the file cannot be opened or parsed
     */
    private static String[] extractLines(String pdfPath) throws IOException {
        try (InputStream in = new FileInputStream(pdfPath)) {
            PDDocument pdDoc = null;
            COSDocument cosDoc = null;
            try {
                PDFParser parser = new PDFParser(in);
                parser.parse();
                cosDoc = parser.getDocument();
                pdDoc = new PDDocument(cosDoc);

                PDFTextStripper pdfStripper = new PDFTextStripper();
                pdfStripper.setStartPage(FIRST_PAGE);
                pdfStripper.setEndPage(LAST_PAGE);
                String parsedText = pdfStripper.getText(pdDoc);

                // The separator is literal text, not a regex, so quote it before splitting.
                return parsedText.split(Pattern.quote(pdfStripper.getLineSeparator()));
            } finally {
                // PDDocument.close() is documented to close the underlying COSDocument;
                // close the COSDocument directly only if the PDDocument was never created.
                if (pdDoc != null) {
                    pdDoc.close();
                } else if (cosDoc != null) {
                    cosDoc.close();
                }
            }
        }
    }

    /**
     * Builds one CSV row from a trimmed data line.
     *
     * <p>Field layout: the first token is {@code htno}, the second is {@code subcode}, the
     * following run of subject-word tokens is the subject name (joined with single spaces),
     * the next token is {@code internal}, the one after it is {@code ext}, and {@code credit}
     * is the last token of the line. Fields that are not present are written as empty
     * strings rather than the text "null".
     *
     * @param trimLine a trimmed line containing at least one space
     * @return the CSV row terminated by {@code "\n"}
     */
    private static String toCsvRow(String trimLine) {
        String[] fields = SPACES.split(trimLine);
        String htno = fields.length > 0 ? fields[0] : "";
        String subcode = fields.length > 1 ? fields[1] : "";

        int next = 2;
        StringBuilder subname = new StringBuilder();
        while (next < fields.length && SUBJECT_WORD.matcher(fields[next]).matches()) {
            if (subname.length() > 0) {
                subname.append(' ');
            }
            subname.append(fields[next]);
            next++;
        }

        String internal = next < fields.length ? fields[next] : "";
        boolean hasExt = next + 1 < fields.length;
        String ext = hasExt ? fields[next + 1] : "";
        // Credit is only meaningful once an ext token exists; it is always the last token.
        String credit = hasExt ? fields[fields.length - 1] : "";

        return htno + "," + subcode + "," + subname + "," + internal + "," + ext + "," + credit + "\n";
    }
}
No comments:
Post a Comment