// HtmlContentProcessor.java
/*-
* #%L
* io.earcam.utilitarian.site.search.offline
* %%
* Copyright (C) 2017 earcam
* %%
* SPDX-License-Identifier: (BSD-3-Clause OR EPL-1.0 OR Apache-2.0 OR MIT)
*
* You <b>must</b> choose to accept, in full - any individual or combination of
* the following licenses:
* <ul>
* <li><a href="https://opensource.org/licenses/BSD-3-Clause">BSD-3-Clause</a></li>
* <li><a href="https://www.eclipse.org/legal/epl-v10.html">EPL-1.0</a></li>
* <li><a href="https://www.apache.org/licenses/LICENSE-2.0">Apache-2.0</a></li>
* <li><a href="https://opensource.org/licenses/MIT">MIT</a></li>
* </ul>
* #L%
*/
package io.earcam.utilitarian.site.search.offline;
import static java.nio.charset.StandardCharsets.UTF_8;
import java.io.FileInputStream;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Fills in the {@link Document#TITLE}, {@link Document#DESCRIPTION} and
 * {@link Document#RAW_TEXT} fields of a {@link Document} from the HTML file it
 * refers to, using jsoup to parse the markup.
 */
public class HtmlContentProcessor extends AbstractHtmlProcessor {

    private static final Logger LOG = LoggerFactory.getLogger(HtmlContentProcessor.class);

    /**
     * Tags whose text makes up the raw-text field, in the order their text is
     * concatenated (all headings first, then paragraphs).
     */
    private static final String[] RAW_TEXT_TAGS = { "h1", "h2", "h3", "h4", "h5", "h6", "p" };


    /**
     * Parses the document's file as UTF-8 HTML and assigns the title,
     * description and raw-text fields.
     * <p>
     * Nothing happens unless {@code isHtml(document)} is true and
     * {@code document.hasRaw()} is false (presumably: the file is HTML and no
     * raw text has been extracted yet - confirm against those methods).
     * <p>
     * An {@link IOException} while opening, reading or closing the file is not
     * propagated: it is logged at WARN (message only) and at DEBUG (with stack
     * trace), and the document's fields are left as they were.
     *
     * @param document the document to populate
     */
    @Override
    public void process(Document document)
    {
        if(isHtml(document) && !document.hasRaw()) {
            // try-with-resources guarantees the file handle is released even when
            // parsing fails; jsoup is not relied upon to close the stream for us.
            try(FileInputStream input = new FileInputStream(document.file().toFile())) {
                org.jsoup.nodes.Document html = Jsoup.parse(input, UTF_8.name(), "");
                assignFields(document, html);
            } catch(IOException e) {
                LOG.warn("Failed to process HTML {} due to: {}", document.file(), e.getMessage());
                LOG.debug("Failed to process HTML", e);
            }
        }
    }


    /**
     * Copies the title, the {@code content} of the {@code description} meta
     * tag, and the combined heading/paragraph text from the parsed HTML onto
     * the document.
     *
     * @param document the document receiving the fields
     * @param html the parsed HTML to read from
     */
    private void assignFields(Document document, org.jsoup.nodes.Document html)
    {
        document.field(Document.TITLE, html.getElementsByTag("title").text());
        document.field(Document.DESCRIPTION, html.getElementsByTag("meta").select("[name=description]").attr("content"));

        // Text is grouped per tag (not in document order) and separated by a
        // single space, including when a tag contributes no text at all.
        StringBuilder rawText = new StringBuilder();
        for(int i = 0; i < RAW_TEXT_TAGS.length; i++) {
            if(i > 0) {
                rawText.append(' ');
            }
            rawText.append(html.getElementsByTag(RAW_TEXT_TAGS[i]).text());
        }
        document.field(Document.RAW_TEXT, rawText.toString());
    }
}