1 | /*- | |
2 | * #%L | |
3 | * io.earcam.utilitarian.site.search.offline | |
4 | * %% | |
5 | * Copyright (C) 2017 earcam | |
6 | * %% | |
7 | * SPDX-License-Identifier: (BSD-3-Clause OR EPL-1.0 OR Apache-2.0 OR MIT) | |
8 | * | |
9 | * You <b>must</b> choose to accept, in full - any individual or combination of | |
10 | * the following licenses: | |
11 | * <ul> | |
12 | * <li><a href="https://opensource.org/licenses/BSD-3-Clause">BSD-3-Clause</a></li> | |
13 | * <li><a href="https://www.eclipse.org/legal/epl-v10.html">EPL-1.0</a></li> | |
14 | * <li><a href="https://www.apache.org/licenses/LICENSE-2.0">Apache-2.0</a></li> | |
15 | * <li><a href="https://opensource.org/licenses/MIT">MIT</a></li> | |
16 | * </ul> | |
17 | * #L% | |
18 | */ | |
19 | package io.earcam.utilitarian.site.search.offline; | |
20 | ||
21 | import static java.nio.charset.StandardCharsets.UTF_8; | |
22 | ||
23 | import java.io.FileInputStream; | |
24 | import java.io.IOException; | |
25 | ||
26 | import org.jsoup.Jsoup; | |
27 | import org.slf4j.Logger; | |
28 | import org.slf4j.LoggerFactory; | |
29 | ||
30 | public class HtmlContentProcessor extends AbstractHtmlProcessor { | |
31 | ||
32 | private static final Logger LOG = LoggerFactory.getLogger(HtmlContentProcessor.class); | |
33 | ||
34 | ||
35 | @Override | |
36 | public void process(Document document) | |
37 | { | |
38 |
2
1. process : negated conditional → SURVIVED 2. process : negated conditional → SURVIVED |
if(isHtml(document) && !document.hasRaw()) { |
39 | org.jsoup.nodes.Document html; | |
40 | try { | |
41 | html = Jsoup.parse(new FileInputStream(document.file().toFile()), UTF_8.toString(), ""); | |
42 |
1
1. process : removed call to io/earcam/utilitarian/site/search/offline/HtmlContentProcessor::assignFields → SURVIVED |
assignFields(document, html); |
43 | } catch(IOException e) { | |
44 | LOG.warn("Failed to process HTML {} due to: {}", document.file(), e.getMessage()); | |
45 | LOG.debug("Failed to process HTML", e); | |
46 | } | |
47 | } | |
48 | } | |
49 | ||
50 | ||
51 | private void assignFields(Document document, org.jsoup.nodes.Document html) | |
52 | { | |
53 |
1
1. assignFields : removed call to io/earcam/utilitarian/site/search/offline/Document::field → SURVIVED |
document.field(Document.TITLE, html.getElementsByTag("title").text()); |
54 |
1
1. assignFields : removed call to io/earcam/utilitarian/site/search/offline/Document::field → SURVIVED |
document.field(Document.DESCRIPTION, html.getElementsByTag("meta").select("[name=description]").attr("content")); |
55 | ||
56 |
1
1. assignFields : removed call to io/earcam/utilitarian/site/search/offline/Document::field → SURVIVED |
document.field(Document.RAW_TEXT, html.getElementsByTag("h1").text() + ' ' + |
57 | html.getElementsByTag("h2").text() + ' ' + | |
58 | html.getElementsByTag("h3").text() + ' ' + | |
59 | html.getElementsByTag("h4").text() + ' ' + | |
60 | html.getElementsByTag("h5").text() + ' ' + | |
61 | html.getElementsByTag("h6").text() + ' ' + | |
62 | html.getElementsByTag("p").text()); | |
63 | } | |
64 | } | |
Mutations | ||
38 |
1.1 2.2 |
|
42 |
1.1 |
|
53 |
1.1 |
|
54 |
1.1 |
|
56 |
1.1 |