DefaultIndexer.java
/*-
* #%L
* io.earcam.utilitarian.site.search.offline
* %%
* Copyright (C) 2017 earcam
* %%
* SPDX-License-Identifier: (BSD-3-Clause OR EPL-1.0 OR Apache-2.0 OR MIT)
*
* You <b>must</b> choose to accept, in full - any individual or combination of
* the following licenses:
* <ul>
* <li><a href="https://opensource.org/licenses/BSD-3-Clause">BSD-3-Clause</a></li>
* <li><a href="https://www.eclipse.org/legal/epl-v10.html">EPL-1.0</a></li>
* <li><a href="https://www.apache.org/licenses/LICENSE-2.0">Apache-2.0</a></li>
* <li><a href="https://opensource.org/licenses/MIT">MIT</a></li>
* </ul>
* #L%
*/
package io.earcam.utilitarian.site.search.offline;
import static io.earcam.unexceptional.Closing.closeAfterAccepting;
import static io.earcam.unexceptional.Exceptional.apply;
import static io.earcam.utilitarian.site.search.offline.Component.getOrDefault;
import static io.earcam.utilitarian.site.search.offline.Component.mandatory;
import static io.earcam.utilitarian.site.search.offline.Javascript.createJavascriptEngine;
import static io.earcam.utilitarian.site.search.offline.Javascript.invokeFunction;
import static io.earcam.utilitarian.site.search.offline.Resources.SCRIPT_INDEX;
import static io.earcam.utilitarian.site.search.offline.Resources.SCRIPT_SEARCH;
import static io.earcam.utilitarian.site.search.offline.Resources.getResource;
import static java.nio.charset.StandardCharsets.UTF_8;
import static java.util.Arrays.stream;
import static java.util.Collections.emptyMap;
import static java.util.function.Function.identity;
import static java.util.stream.Collectors.joining;
import static java.util.stream.Collectors.toMap;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import java.util.stream.Stream;
import java.util.zip.GZIPOutputStream;
import javax.annotation.WillNotClose;
import javax.script.Invocable;
import javax.script.ScriptException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
//TODO test maven SOURCE filtering to replace VERSION_* constants ... but will IDE/Eclipse do the replacement before test exec?
/**
 * {@link Indexer} that passes {@link Document}s to an index builder running inside a JavaScript engine
 * (loaded with the lunr.js webjar plus this module's index script) and writes the outcome as one JSON
 * object with the members {@code index}, {@code autocomplete} and {@code titleMap}.
 * <p>
 * Expected call order: {@link #configure(Map)}, then {@link #add(Stream)} any number of times, then
 * {@link #writeJson()}.
 * </p>
 */
public class DefaultIndexer implements Indexer {

    private static final Logger LOG = LoggerFactory.getLogger(DefaultIndexer.class);

    /** Classpath directory under which webjar resources are looked up. */
    public static final String BASEDIR_WEBJARS_RESOURCES = "META-INF/resources/webjars/";

    /** Version segment used to build the {@link #LUNR} resource path. */
    public static final String VERSION_LUNR_JS = "2.1.0";

    /** Classpath location of the lunr.js library. */
    public static final String LUNR = BASEDIR_WEBJARS_RESOURCES + "lunr.js/" + VERSION_LUNR_JS + "/lunr.js";

    /** Configuration key (mandatory): path of the file to write; a name ending in {@code .gz} is gzipped. */
    public static final String OUTPUT_FILE = "outputFile";

    /** Configuration key (mandatory): comma separated list of field names to index. */
    public static final String FIELDS = "fields";

    /** Configuration key (optional, defaults to {@code true}): record the URL to title mapping. */
    public static final String MAP_TITLES = "mapTitles";

    /** Configuration key (optional, defaults to {@code true}): collect document tokens for autocomplete. */
    public static final String GENERATE_AUTOCOMPLETE = "generateAutocomplete";

    /** Configuration key (optional, defaults to UTF-8): charset used to encode the written JSON. */
    public static final String OUTPUT_CHARSET = "outputCharset";

    private Path outputFile;

    @SuppressWarnings("squid:S1845")
    private String[] fields;

    private String refUrl;
    private Charset outputCharset;

    private Map<String, String> titlesMap = new HashMap<>();
    private SortedSet<String> autocomplete = new TreeSet<>();

    // Both sinks are swapped for no-ops by configure() when the matching feature is switched off.
    private BiConsumer<String, String> titleMapper = titlesMap::put;
    private Consumer<String> autocompleter = autocomplete::add;

    private Invocable engine;
    private Object javascriptIndexBuilder;


    /**
     * Reads the settings from {@code configuration} and creates the JavaScript engine and index builder.
     *
     * @param configuration settings keyed by {@link #OUTPUT_FILE}, {@link #FIELDS}, {@code Document.REF_URL},
     * {@link #OUTPUT_CHARSET}, {@link #MAP_TITLES} and {@link #GENERATE_AUTOCOMPLETE}
     */
    @Override
    public void configure(Map<String, String> configuration)
    {
        outputCharset = getOrDefault(configuration, OUTPUT_CHARSET, UTF_8);
        refUrl = mandatory(configuration, Document.REF_URL);
        outputFile = Paths.get(mandatory(configuration, OUTPUT_FILE));

        // Trim each entry and drop blanks so that "title, text" yields the names "title" and "text".
        fields = stream(mandatory(configuration, FIELDS).split(","))
                .map(String::trim)
                .filter(field -> !field.isEmpty())
                .toArray(String[]::new);

        if(!getOrDefault(configuration, MAP_TITLES, true)) {
            titleMapper = (u, t) -> { /* noop */ };
        }
        if(!getOrDefault(configuration, GENERATE_AUTOCOMPLETE, true)) {
            autocompleter = d -> { /* noop */ };
        }
        initialize();
    }


    /**
     * Creates the engine for the index script and asks it for an index builder, passing the reference
     * URL and one empty configuration map per field.
     */
    private void initialize()
    {
        engine = createSearchEngine(SCRIPT_INDEX);
        Map<String, Map<Object, Object>> fieldConfigurations = stream(fields).collect(toMap(identity(), v -> emptyMap()));
        javascriptIndexBuilder = invokeFunction(engine, "createIndexBuilder", refUrl, fieldConfigurations);
    }


    /**
     * Creates a JavaScript engine from the lunr.js library and the given script resource.
     *
     * @param script resource name of the script to load alongside lunr.js
     * @return the engine created from both resources
     * @throws NullPointerException if either resource cannot be found
     */
    static Invocable createSearchEngine(String script)
    {
        InputStream lunr = getResource(LUNR);
        InputStream indexScript = getResource(script);
        Objects.requireNonNull(lunr, "Could not load lunrjs lib");
        Objects.requireNonNull(indexScript, "Could not load indexScript");
        return createJavascriptEngine(lunr, indexScript);
    }


    /**
     * Adds every document that has tokens to the index builder.
     * <p>
     * The stream is handed over as an iterator; the title and autocomplete sinks are invoked for each
     * document as that iterator is advanced.
     * </p>
     *
     * @param documents the documents to index
     * @return this indexer
     */
    @Override
    public synchronized Indexer add(Stream<Document> documents)
    {
        invokeFunction(engine, "addDocuments", javascriptIndexBuilder, documents
                .filter(Document::hasTokens)
                .peek(d -> titleMapper.accept(d.refUrl(), d.title()))
                .peek(d -> d.tokens().forEach(autocompleter::accept))
                .map(Document::asMap)
                .iterator());
        return this;
    }


    /**
     * Writes the JSON to the configured output file, creating missing parent directories first.
     * The output is gzip compressed when the file name ends in {@code .gz}.
     */
    @Override
    public void writeJson()
    {
        // A bare file name such as "index.json" has no parent; there is nothing to create then.
        Path parent = outputFile.getParent();
        if(parent != null) {
            parent.toFile().mkdirs();
        }

        if(isGzip()) {
            closeAfterAccepting(DefaultIndexer::openGzip, outputFile, this::writeJson);
        } else {
            closeAfterAccepting(FileOutputStream::new, outputFile.toFile(), this::writeJson);
        }
    }


    /**
     * Opens {@code file} for writing wrapped in a gzip stream. The underlying file stream is closed
     * if the gzip wrapper cannot be created, so that no file handle is leaked.
     *
     * @param file the file to write
     * @return a gzip compressing stream over {@code file}
     * @throws IOException if the file cannot be opened or the gzip wrapper cannot be created
     */
    private static OutputStream openGzip(Path file) throws IOException
    {
        FileOutputStream raw = new FileOutputStream(file.toFile());
        try {
            return new GZIPOutputStream(raw);
        } catch(IOException | RuntimeException e) {
            try {
                raw.close();
            } catch(IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }


    /**
     * @return {@code true} if the output file name ends with {@code .gz}
     */
    private boolean isGzip()
    {
        return outputFile.getFileName().toString().endsWith(".gz");
    }


    /**
     * Writes the complete JSON object (index, then autocomplete, then title map) to {@code output}.
     *
     * @param output the destination; it is not closed by this method
     * @throws IOException if writing fails
     */
    protected void writeJson(@WillNotClose OutputStream output) throws IOException
    {
        writeIndex(output);
        writeAutocomplete(output);
        writeTitleMap(output);
    }


    /**
     * Writes the opening brace of the JSON object followed by the {@code index} member.
     */
    private void writeIndex(OutputStream output) throws IOException
    {
        output.write(bytes("{\n\n\"index\": "));
        String indexJson = serializeIndex();
        byte[] bytes = bytes(indexJson);
        output.write(bytes);
        String id = id();
        LOG.debug("{} wrote {} bytes for index to {}", id, bytes.length, outputFile);
    }


    /**
     * Encodes {@code text} using the configured output charset.
     *
     * @param text the text to encode
     * @return the encoded bytes
     */
    public byte[] bytes(String text)
    {
        return text.getBytes(outputCharset);
    }


    /**
     * Invokes the script function {@code buildSerializedIndex} on the index builder.
     *
     * @return the value returned by the script, cast to a string
     */
    public String serializeIndex()
    {
        return (String) invokeFunction(engine, "buildSerializedIndex", javascriptIndexBuilder);
    }


    /**
     * Writes the {@code autocomplete} member: a JSON array of the collected words in sorted order.
     * An empty collection is written as {@code []}.
     */
    private void writeAutocomplete(OutputStream output) throws IOException
    {
        byte[] bytes = bytes(autocomplete.stream()
                .map(word -> '"' + escapeJson(word) + '"')
                .collect(joining(", ", ",\n\n\"autocomplete\": [", "]")));
        output.write(bytes);
        String id = id();
        LOG.debug("{} wrote {} bytes for {} words for autocomplete to {}", id, bytes.length, autocomplete.size(), outputFile);
    }


    /**
     * Writes the {@code titleMap} member (reference URL to title) and the closing brace of the JSON object.
     */
    private void writeTitleMap(OutputStream output) throws IOException
    {
        byte[] bytes = bytes(titlesMap.entrySet().stream().map(
                e -> new StringBuilder()
                        .append('"').append(escapeJson(e.getKey())).append('"')
                        .append(':')
                        .append('"').append(escapeJson(e.getValue())).append('"'))
                .collect(joining(", ", ",\n\n\"titleMap\": {", "}\n}")));
        output.write(bytes);
        String id = id();
        LOG.debug("{} wrote {} bytes for {} entries for title map to {}", id, bytes.length, titlesMap.size(), outputFile);
    }


    /**
     * Escapes {@code text} for use between the double quotes of a JSON string: quotation mark, reverse
     * solidus and control characters below U+0020 are replaced by their escape sequences.
     *
     * @param text the raw text; {@code null} is rendered as the four characters {@code null}
     * @return the escaped text, without surrounding quotes
     */
    private static String escapeJson(String text)
    {
        String raw = String.valueOf(text);
        StringBuilder escaped = new StringBuilder(raw.length() + 16);
        for(int i = 0; i < raw.length(); i++) {
            char c = raw.charAt(i);
            switch(c) {
                case '"':
                    escaped.append("\\\"");
                    break;
                case '\\':
                    escaped.append("\\\\");
                    break;
                case '\b':
                    escaped.append("\\b");
                    break;
                case '\f':
                    escaped.append("\\f");
                    break;
                case '\n':
                    escaped.append("\\n");
                    break;
                case '\r':
                    escaped.append("\\r");
                    break;
                case '\t':
                    escaped.append("\\t");
                    break;
                default:
                    if(c < 0x20) {
                        escaped.append(String.format("\\u%04x", (int) c));
                    } else {
                        escaped.append(c);
                    }
            }
        }
        return escaped.toString();
    }


    /**
     * Runs {@code query} against a serialized index using a fresh engine loaded with the search script.
     *
     * @param indexJson the JSON passed to the script function {@code jsonSearchIndex}
     * @param query the search query
     * @return the string form of the script's result
     * @throws ScriptException if the script invocation fails
     * @throws NoSuchMethodException if the script does not define {@code jsonSearchIndex}
     */
    public static String search(String indexJson, String query) throws ScriptException, NoSuchMethodException
    {
        Invocable engine = createSearchEngine(SCRIPT_SEARCH);
        return engine.invokeFunction("jsonSearchIndex", indexJson, query).toString();
    }
}