// PdfContentProcessor.java
/*-
* #%L
* io.earcam.utilitarian.site.search.offline
* %%
* Copyright (C) 2017 earcam
* %%
* SPDX-License-Identifier: (BSD-3-Clause OR EPL-1.0 OR Apache-2.0 OR MIT)
*
* You <b>must</b> choose to accept, in full - any individual or combination of
* the following licenses:
* <ul>
* <li><a href="https://opensource.org/licenses/BSD-3-Clause">BSD-3-Clause</a></li>
* <li><a href="https://www.eclipse.org/legal/epl-v10.html">EPL-1.0</a></li>
* <li><a href="https://www.apache.org/licenses/LICENSE-2.0">Apache-2.0</a></li>
* <li><a href="https://opensource.org/licenses/MIT">MIT</a></li>
* </ul>
* #L%
*/
package io.earcam.utilitarian.site.search.offline;
import java.io.IOException;
import java.io.UncheckedIOException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.text.PDFTextStripper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import io.earcam.unexceptional.Closing;
/**
 * {@link Processor} that populates a {@link Document} from a PDF file.
 * <p>
 * Only documents whose content type is {@code application/pdf} and for which
 * {@link Document#hasRaw()} is {@code false} are touched. For those, the file is
 * loaded with PDFBox and two fields are written: {@link Document#TITLE} (from the
 * PDF's document information) and {@link Document#RAW_TEXT} (the stripped text).
 * </p>
 */
public class PdfContentProcessor implements Processor {

    private static final Logger LOG = LoggerFactory.getLogger(PdfContentProcessor.class);

    static {
        // JVM-wide side effect: selects the KCMS colour-management provider when this class is loaded.
        // NOTE(review): presumably follows PDFBox's performance guidance for newer JDKs - confirm.
        System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider");
    }


    /**
     * Extracts title and text from the document's file when it is an unprocessed PDF.
     * <p>
     * An {@link UncheckedIOException} raised while handling the PDF is not propagated;
     * it is logged (message at WARN, cause at DEBUG) and the document is left as-is.
     * </p>
     *
     * @param document the document to inspect and, if applicable, enrich
     */
    @Override
    public void process(Document document)
    {
        // Same short-circuit order as a positive "is PDF and has no raw" check
        if(!hasPdfContentType(document) || document.hasRaw()) {
            return;
        }
        try {
            // NOTE(review): Closing.closeAfterAccepting presumably closes the loaded PDDocument
            // and surfaces IOException as UncheckedIOException - the catch below relies on that.
            Closing.closeAfterAccepting(PDDocument::load, document.file().toFile(), document, this::extractInto);
        } catch(UncheckedIOException failure) {
            LOG.warn("Failed to process PDF {} due to: {}", document.file(), failure.getMessage());
            LOG.debug("Failed to process PDF", failure.getCause());
        }
    }


    /**
     * Copies the PDF's title and full stripped text into the given document's fields.
     *
     * @param pdf the loaded PDF
     * @param document the document receiving the {@code TITLE} and {@code RAW_TEXT} fields
     * @throws IOException if text extraction fails
     */
    private void extractInto(PDDocument pdf, Document document) throws IOException
    {
        PDDocumentInformation metadata = pdf.getDocumentInformation();
        document.field(Document.TITLE, metadata.getTitle());

        // Title is written before extraction starts, so it is set even if stripping throws
        document.field(Document.RAW_TEXT, new PDFTextStripper().getText(pdf));
    }


    /**
     * @param document the document to test
     * @return {@code true} only when the document's content type equals {@code application/pdf}
     */
    private boolean hasPdfContentType(Document document)
    {
        String contentType = document.contentType();
        return "application/pdf".equals(contentType);
    }
}