SimpleTokenizer.java

/*-
 * #%L
 * io.earcam.utilitarian.site.search.offline
 * %%
 * Copyright (C) 2017 earcam
 * %%
 * SPDX-License-Identifier: (BSD-3-Clause OR EPL-1.0 OR Apache-2.0 OR MIT)
 *
 * You <b>must</b> choose to accept, in full - any individual or combination of
 * the following licenses:
 * <ul>
 * 	<li><a href="https://opensource.org/licenses/BSD-3-Clause">BSD-3-Clause</a></li>
 * 	<li><a href="https://www.eclipse.org/legal/epl-v10.html">EPL-1.0</a></li>
 * 	<li><a href="https://www.apache.org/licenses/LICENSE-2.0">Apache-2.0</a></li>
 * 	<li><a href="https://opensource.org/licenses/MIT">MIT</a></li>
 * </ul>
 * #L%
 */
package io.earcam.utilitarian.site.search.offline;

import static java.util.Collections.emptyList;

import java.io.IOException;
import java.io.StringReader;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import io.earcam.unexceptional.Closing;

/**
 * A {@link Processor} that splits raw text into tokens using a Lucene
 * {@link Analyzer} (by default a {@link SimpleAnalyzer}).
 */
public class SimpleTokenizer implements Processor {

	private static final Logger LOG = LoggerFactory.getLogger(SimpleTokenizer.class);


	/**
	 * Tokenizes the given text with the {@link Analyzer} supplied by
	 * {@link #createAnalyzer()}; the analyzer is closed before returning.
	 *
	 * @param input the text to tokenize
	 * @return the tokens in the order produced by the analyzer, or an empty
	 * list if an {@link UncheckedIOException} was raised while tokenizing
	 */
	public List<String> tokenize(String input)
	{
		try {
			return Closing.closeAfterApplying(createAnalyzer(), input, this::tokens);
		} catch(UncheckedIOException e) {
			// Best-effort: log and fall through to an empty result. The exception is
			// passed as the trailing argument so the stack trace and cause are kept.
			LOG.warn("Failed to tokenize '{}', due to {}", input, e.getMessage(), e);
		}
		return emptyList();
	}


	/**
	 * Opens a {@link TokenStream} over the input, drains it, and closes it.
	 *
	 * @param analyzer the analyzer used to create the token stream
	 * @param input the text to tokenize
	 * @return the tokens read from the stream
	 */
	private List<String> tokens(Analyzer analyzer, String input)
	{
		return Closing.closeAfterApplying(analyzer.tokenStream(null, new StringReader(input)), this::streamTokens);
	}


	/**
	 * Consumes the stream following the Lucene consumer workflow:
	 * obtain attributes, {@code reset()}, {@code incrementToken()} until
	 * exhausted, then {@code end()}. Closing the stream is left to the caller.
	 *
	 * @param stream the token stream to drain
	 * @return the text of every token, in stream order
	 * @throws IOException if the stream fails while being read
	 */
	private List<String> streamTokens(TokenStream stream) throws IOException
	{
		// Looked up once: the attribute instance is reused by the stream for every token.
		CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
		List<String> tokens = new ArrayList<>();

		stream.reset();
		while(stream.incrementToken()) {
			// Copy to a String, as the attribute's content changes on the next increment.
			tokens.add(term.toString());
		}
		// Required end-of-stream step of the TokenStream contract, before close().
		stream.end();
		return tokens;
	}


	/**
	 * <p>
	 * Override this method to return a custom {@link Analyzer}.
	 * </p>
	 * <p>
	 * Note: any use of Lucene for stemming, stopword filtering, etc. must match
	 * whatever is configured for lunrjs.
	 * </p>
	 * <p>
	 * A new analyzer is requested for each call to {@link #tokenize(String)},
	 * which closes it once tokenizing completes.
	 * </p>
	 *
	 * @return an {@link Analyzer} for tokenizing
	 */
	protected Analyzer createAnalyzer()
	{
		return new SimpleAnalyzer();
	}


	/**
	 * Tokenizes the document's raw text and appends the result to the
	 * document's tokens. Does nothing when the document has no raw text or
	 * already has tokens.
	 *
	 * @param document the document to tokenize
	 */
	@Override
	public void process(Document document)
	{
		if(document.hasRaw() && !document.hasTokens()) {
			List<String> tokenized = tokenize(document.raw());
			document.tokens().addAll(tokenized);
		}
	}
}