// Document.java
/*-
* #%L
* io.earcam.utilitarian.site.search.offline
* %%
* Copyright (C) 2017 earcam
* %%
* SPDX-License-Identifier: (BSD-3-Clause OR EPL-1.0 OR Apache-2.0 OR MIT)
*
* You <b>must</b> choose to accept, in full - any individual or combination of
* the following licenses:
* <ul>
* <li><a href="https://opensource.org/licenses/BSD-3-Clause">BSD-3-Clause</a></li>
* <li><a href="https://www.eclipse.org/legal/epl-v10.html">EPL-1.0</a></li>
* <li><a href="https://www.apache.org/licenses/LICENSE-2.0">Apache-2.0</a></li>
* <li><a href="https://opensource.org/licenses/MIT">MIT</a></li>
* </ul>
* #L%
*/
package io.earcam.utilitarian.site.search.offline;
import static java.util.stream.Collectors.joining;
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.UncheckedIOException;
import java.net.URI;
import java.net.URLConnection;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import io.earcam.unexceptional.Closing;
import io.earcam.unexceptional.Exceptional;
// TODO: make this an XmlRootElement - get Maven to pump config, copy 'n' paste for "main runner"
/**
 * A single file to be indexed for search: the file's path, the URL under which
 * it is referenced, a mutable bag of named string fields and a mutable list of tokens.
 *
 * <p>
 * Instances are created through {@link #document(Path, URI, Path)}, which also
 * attempts to detect the file's content type. The string accessors
 * ({@link #raw()}, {@link #title()}, {@link #contentType()}, {@link #field(String)})
 * return an empty string, never {@code null}, when no value is available.
 * </p>
 */
public class Document {

    /** Field key: local file. */
    public static final String LOCAL_FILE = "file";

    /** Key under which {@link #asMap()} exposes the reference URL. */
    public static final String REF_URL = "url";

    /** Field key read by {@link #raw()} and {@link #hasRaw()}. */
    public static final String RAW_TEXT = "raw";

    /** Key under which {@link #asMap()} exposes the space-joined tokens. */
    public static final String TEXT = "text";

    /** Field key read by {@link #title()}. */
    public static final String TITLE = "title";

    /** Field key: description. */
    public static final String DESCRIPTION = "description";

    /** Field key read by {@link #contentType()}; populated by the factory when detection succeeds. */
    public static final String CONTENT_TYPE = "contentType";

    private final Map<String, String> fields = new HashMap<>();
    private final List<String> tokens = new ArrayList<>();

    private final Path file;
    private final String refUrl;


    private Document(Path file, String refUrl)
    {
        this.file = file;
        this.refUrl = refUrl;
    }


    /**
     * @return the path of the file this document was created for
     */
    public Path file()
    {
        return file;
    }


    /**
     * @return the value of the {@link #RAW_TEXT} field, or an empty string if there is none
     */
    public String raw()
    {
        return valueOrEmpty(RAW_TEXT);
    }


    /**
     * @return {@code true} if a {@link #RAW_TEXT} field has been set
     */
    public boolean hasRaw()
    {
        return fields.containsKey(RAW_TEXT);
    }


    /**
     * @return the live, mutable token list backing this document (not a copy)
     */
    public List<String> tokens()
    {
        return tokens;
    }


    /**
     * @return {@code true} if at least one token is present
     */
    public boolean hasTokens()
    {
        return !tokens.isEmpty();
    }


    /**
     * @return the reference URL computed when this document was created
     */
    public String refUrl()
    {
        return refUrl;
    }


    /**
     * @return the value of the {@link #TITLE} field, or an empty string if there is none
     */
    @SuppressWarnings("squid:S1845")
    public String title()
    {
        return valueOrEmpty(TITLE);
    }


    /**
     * Sets (or overwrites) a named field.
     *
     * @param key the field name
     * @param value the field value
     */
    public void field(String key, String value)
    {
        fields.put(key, value);
    }


    /**
     * @param key the field name
     * @return the field's value, or an empty string if the field is absent or was set to {@code null}
     */
    public String field(String key)
    {
        return valueOrEmpty(key);
    }


    /**
     * @return the value of the {@link #CONTENT_TYPE} field, or an empty string if
     * no content type could be detected
     */
    public String contentType()
    {
        return valueOrEmpty(CONTENT_TYPE);
    }


    /**
     * Looks up a field, mapping both "absent" and "present but null" to the empty string.
     * {@code Map.getOrDefault} alone is insufficient here: it only applies the default
     * when the key is missing, not when it is mapped to {@code null}.
     */
    private String valueOrEmpty(String key)
    {
        String value = fields.get(key);
        return (value == null) ? "" : value;
    }


    /**
     * Creates a document for {@code file}.
     *
     * <p>
     * The reference URL is the path component of {@code baseUri} (with a trailing
     * slash ensured) followed by the file's URI relativized against the URI of
     * {@code baseDir}. The content type is then detected on a best-effort basis and,
     * if found, stored under {@link #CONTENT_TYPE}.
     * </p>
     *
     * @param baseDir the directory against which {@code file} is relativized
     * @param baseUri the base URI; only its path component is used
     * @param file the file the document represents
     * @return a new document
     */
    public static Document document(Path baseDir, URI baseUri, Path file)
    {
        String relativeUri = ensureTrailingSlash(baseUri, true);
        String refUrl = relativizeReferenceUri(baseDir, relativeUri, file);
        Document document = new Document(file, refUrl);
        deduceContentType(file, document);
        return document;
    }


    /**
     * Tries each content-type detector in turn until one yields a non-null result,
     * then records that result on the document. If none succeeds, nothing is recorded.
     */
    private static void deduceContentType(Path file, Document document)
    {
        String contentType = null;
        Iterator<Function<Path, String>> detectors = Arrays.<Function<Path, String>> asList(
                Document::probeContentType,
                Document::guessContentTypeFromStream,
                Document::guessContentTypeFromName).iterator();

        while(contentType == null && detectors.hasNext()) {
            // Guard each detector individually, so that an I/O failure in one
            // (e.g. the file cannot be opened for sniffing) does not prevent the
            // remaining detectors - notably the name-based one - from running.
            try {
                contentType = detectors.next().apply(file);
            } catch(UncheckedIOException e) {
                Exceptional.swallow(e);
            }
        }

        // Never store null: a null mapping would leak out of the accessors and asMap().
        if(contentType != null) {
            document.field(CONTENT_TYPE, contentType);
        }
    }


    /**
     * Detects the content type via {@link Files#probeContentType(Path)}.
     * NOTE(review): relies on {@code Exceptional.apply} surfacing the checked
     * {@code IOException} as an unchecked exception - confirm against the library.
     */
    private static String probeContentType(Path file)
    {
        return Exceptional.apply(Files::probeContentType, file);
    }


    /**
     * Detects the content type from the file's content via
     * {@link URLConnection#guessContentTypeFromStream(java.io.InputStream)};
     * the stream is handed to {@code Closing.closeAfterApplying}.
     */
    private static String guessContentTypeFromStream(Path file)
    {
        BufferedInputStream input = new BufferedInputStream(Exceptional.apply(FileInputStream::new, file.toFile()));
        return Closing.closeAfterApplying(input, URLConnection::guessContentTypeFromStream);
    }


    /**
     * Detects the content type from the file's name via
     * {@link URLConnection#guessContentTypeFromName(String)}.
     *
     * @return the guessed type, or {@code null} if there is no guess or the path has no file name
     */
    private static String guessContentTypeFromName(Path file)
    {
        // getFileName() is null for a path with no name elements (e.g. a root)
        Path name = file.getFileName();
        return (name == null) ? null : URLConnection.guessContentTypeFromName(name.toString());
    }


    /**
     * Builds a reference URI by appending, to {@code baseUri}, the URI of {@code file}
     * relativized against the URI of {@code baseDir}.
     *
     * @param baseDir the directory against which {@code file} is relativized
     * @param baseUri the prefix, used verbatim
     * @param file the file to reference
     * @return {@code baseUri} followed by the relativized file URI
     */
    public static String relativizeReferenceUri(Path baseDir, String baseUri, Path file)
    {
        return baseUri + baseDir.toUri().relativize(file.toUri()).toString();
    }


    /**
     * Renders the URI (or only its path, when {@code uriPathOnly} is set) as a string
     * that is guaranteed to end with {@code '/'}.
     */
    private static String ensureTrailingSlash(URI uri, boolean uriPathOnly)
    {
        String yuri = uriPathOnly ? uri.getPath() : uri.toString();
        if(yuri == null) {
            // URI.getPath() is null for opaque URIs; treat that as an empty path
            yuri = "";
        }
        return yuri.endsWith("/") ? yuri : (yuri + '/');
    }


    /**
     * Produces a snapshot of this document as a map: every field, plus the reference
     * URL under {@link #REF_URL} and the tokens joined by single spaces under
     * {@link #TEXT}. These two entries overwrite any same-named fields.
     *
     * @return a new, independent map
     */
    public Map<String, String> asMap()
    {
        Map<String, String> map = new HashMap<>(fields);
        map.put(REF_URL, refUrl);
        map.put(TEXT, tokens.stream().collect(joining(" ")));
        return map;
    }
}