1 | /*- | |
2 | * #%L | |
3 | * io.earcam.utilitarian.site.search.offline | |
4 | * %% | |
5 | * Copyright (C) 2017 earcam | |
6 | * %% | |
7 | * SPDX-License-Identifier: (BSD-3-Clause OR EPL-1.0 OR Apache-2.0 OR MIT) | |
8 | * | |
9 | * You <b>must</b> choose to accept, in full - any individual or combination of | |
10 | * the following licenses: | |
11 | * <ul> | |
12 | * <li><a href="https://opensource.org/licenses/BSD-3-Clause">BSD-3-Clause</a></li> | |
13 | * <li><a href="https://www.eclipse.org/legal/epl-v10.html">EPL-1.0</a></li> | |
14 | * <li><a href="https://www.apache.org/licenses/LICENSE-2.0">Apache-2.0</a></li> | |
15 | * <li><a href="https://opensource.org/licenses/MIT">MIT</a></li> | |
16 | * </ul> | |
17 | * #L% | |
18 | */ | |
19 | package io.earcam.utilitarian.site.search.offline; | |
20 | ||
21 | import java.io.IOException; | |
22 | import java.io.UncheckedIOException; | |
23 | ||
24 | import org.apache.pdfbox.pdmodel.PDDocument; | |
25 | import org.apache.pdfbox.pdmodel.PDDocumentInformation; | |
26 | import org.apache.pdfbox.text.PDFTextStripper; | |
27 | import org.slf4j.Logger; | |
28 | import org.slf4j.LoggerFactory; | |
29 | ||
30 | import io.earcam.unexceptional.Closing; | |
31 | ||
/**
 * A {@link Processor} that handles PDF documents: when a document's content type is
 * {@code application/pdf} and {@code hasRaw()} reports false, the backing file is
 * opened with PDFBox and the document's {@code TITLE} and {@code RAW_TEXT} fields
 * are populated from it. Any other document is left untouched.
 */
32 | public class PdfContentProcessor implements Processor { | |
33 | ||
34 | private static final Logger LOG = LoggerFactory.getLogger(PdfContentProcessor.class); | |
35 | ||
// Runs once when this class is initialised and unconditionally overwrites the
// JVM-wide "sun.java2d.cmm" system property.
// NOTE(review): presumably the colour-management setting PDFBox recommends for
// performance on newer JDKs - confirm it is still needed for text extraction.
36 | static { | |
37 | System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider"); | |
38 | } | |
39 | ||
40 | ||
/**
 * Extracts title and text from {@code document} if it is a PDF without raw content.
 * <p>
 * An {@link UncheckedIOException} raised while handling the PDF is logged (message
 * at WARN, cause at DEBUG) and not rethrown, so a failing PDF does not abort the caller.
 *
 * @param document the document to inspect and, if applicable, populate
 */
41 | @Override | |
42 | public void process(Document document) | |
43 | { | |
// Only PDFs for which hasRaw() is false are processed.
// NOTE(review): hasRaw() presumably means raw text was already extracted - confirm.
44 | 
2
1. process : negated conditional → KILLED 2. process : negated conditional → KILLED |
if(isPdf(document) && !document.hasRaw()) { |
45 | ||
46 | try { | |
// Passes PDDocument::load, the document's file, the document and this::consume to the helper.
// NOTE(review): presumably the helper loads the PDF, invokes consume(pdf, document),
// closes the PDF afterwards and wraps IOException as UncheckedIOException -
// its implementation is not visible here; confirm against io.earcam.unexceptional.Closing.
47 | 
1
1. process : removed call to io/earcam/unexceptional/Closing::closeAfterAccepting → KILLED |
Closing.closeAfterAccepting(PDDocument::load, document.file().toFile(), document, this::consume); |
48 | } catch(UncheckedIOException e) { | |
// Best-effort: report which file failed and why, keep the stack trace for DEBUG only.
49 | LOG.warn("Failed to process PDF {} due to: {}", document.file(), e.getMessage()); | |
50 | LOG.debug("Failed to process PDF", e.getCause()); | |
51 | } | |
52 | } | |
53 | } | |
54 | ||
55 | ||
/**
 * Copies the PDF's metadata title and its extracted text into {@code document}.
 *
 * @param pdf the loaded PDF to read from
 * @param document the document whose {@code TITLE} and {@code RAW_TEXT} fields are set
 * @throws IOException declared for the PDFBox text extraction performed here
 */
56 | private void consume(PDDocument pdf, Document document) throws IOException | |
57 | { | |
58 | PDDocumentInformation information = pdf.getDocumentInformation(); | |
// NOTE(review): the title is passed through unchecked; confirm how Document.field
// behaves when the PDF carries no title metadata (value may be null).
59 | 
1
1. consume : removed call to io/earcam/utilitarian/site/search/offline/Document::field → KILLED |
document.field(Document.TITLE, information.getTitle()); |
60 | ||
// A fresh stripper with default settings is created for every PDF.
61 | PDFTextStripper stripper = new PDFTextStripper(); | |
62 | String text = stripper.getText(pdf); | |
63 | 
1
1. consume : removed call to io/earcam/utilitarian/site/search/offline/Document::field → KILLED |
document.field(Document.RAW_TEXT, text); |
64 | } | |
65 | ||
66 | ||
/**
 * Tells whether {@code document} declares the PDF content type.
 *
 * @param document the document to test
 * @return {@code true} only if the content type equals {@code "application/pdf"} exactly
 */
67 | private boolean isPdf(Document document) | |
68 | { | |
// Literal-first equals: a null content type yields false instead of throwing.
69 | 
1
1. isPdf : replaced return of integer sized value with (x == 0 ? 1 : 0) → KILLED |
return "application/pdf".equals(document.contentType()); |
70 | } | |
71 | } | |
Mutations | ||
44 |
1.1 2.2 |
|
47 |
1.1 |
|
59 |
1.1 |
|
63 |
1.1 |
|
69 |
1.1 |