1 | /*- | |
2 | * #%L | |
3 | * io.earcam.utilitarian.site.search.offline | |
4 | * %% | |
5 | * Copyright (C) 2017 earcam | |
6 | * %% | |
7 | * SPDX-License-Identifier: (BSD-3-Clause OR EPL-1.0 OR Apache-2.0 OR MIT) | |
8 | * | |
9 | * You <b>must</b> choose to accept, in full - any individual or combination of | |
10 | * the following licenses: | |
11 | * <ul> | |
12 | * <li><a href="https://opensource.org/licenses/BSD-3-Clause">BSD-3-Clause</a></li> | |
13 | * <li><a href="https://www.eclipse.org/legal/epl-v10.html">EPL-1.0</a></li> | |
14 | * <li><a href="https://www.apache.org/licenses/LICENSE-2.0">Apache-2.0</a></li> | |
15 | * <li><a href="https://opensource.org/licenses/MIT">MIT</a></li> | |
16 | * </ul> | |
17 | * #L% | |
18 | */ | |
19 | package io.earcam.utilitarian.site.search.offline; | |
20 | ||
21 | import java.net.URI; | |
22 | import java.nio.file.Files; | |
23 | import java.nio.file.Path; | |
24 | import java.util.Map; | |
25 | import java.util.function.Predicate; | |
26 | import java.util.stream.Stream; | |
27 | ||
28 | import io.earcam.unexceptional.EmeticStream; | |
29 | ||
30 | // @formatter:off | |
31 | /** | |
32 | * The pipeline needs to be built according to a definition, with names driven via SPI. | |
33 | * | |
34 | * The configuration is just a Map<String, String> passed to each component, e.g. | |
35 | * | |
36 | * <pre> | |
37 | * <pipeline> | |
38 | * <filter> | |
39 | * <id>default-regex</id> | |
40 | * <configuration> | |
41 | * <include>regex</include> | |
42 | * <exclude>regex</exclude> | |
43 | * </configuration> | |
44 | * </filter> | |
45 | * <processor> | |
46 | * <id>default-html</id> | |
47 | * </processor> | |
48 | * <processor> | |
49 | * <id>default-pdf</id> | |
50 | * </processor> | |
51 | * <!-- ... filter based on content can go here ... --> | |
52 | * <processor> | |
53 | * <id>default-tokenizer</id> | |
54 | * </processor> | |
55 | * </pipeline> | |
56 | * </pre> | |
57 | * | |
58 | * Therefore Filter and Processor both need to extend 'Component' | |
59 | * | |
60 | * Component{ String id; void configure(Map<String, String>) } | |
61 | * | |
62 | * Filter implements Predicate<Document> | |
63 | * | |
64 | * Processor{ process(Document); } | |
65 | * | |
66 | * | |
67 | * HtmlContentParser{ } | |
68 | * | |
69 | * | |
70 | */ | |
71 | // @formatter:on | |
72 | public class Crawler { | |
73 | ||
74 | private Stream<Document> documents; | |
75 | ||
76 | ||
77 | public static Crawler crawler(Map<Path, URI> directories) | |
78 | { | |
79 | Crawler crawler = new Crawler(); | |
80 | crawler.documents = crawl(directories); | |
81 |
1
1. crawler : mutated return of Object value for io/earcam/utilitarian/site/search/offline/Crawler::crawler to ( if (x != null) null else throw new RuntimeException ) → KILLED |
return crawler; |
82 | } | |
83 | ||
84 | ||
85 | private static Stream<Document> crawl(Map<Path, URI> directories) | |
86 | { | |
87 |
1
1. crawl : mutated return of Object value for io/earcam/utilitarian/site/search/offline/Crawler::crawl to ( if (x != null) null else throw new RuntimeException ) → KILLED |
return directories.entrySet().parallelStream().flatMap(Crawler::crawl); |
88 | } | |
89 | ||
90 | ||
91 | private static Stream<Document> crawl(Map.Entry<Path, URI> pair) | |
92 | { | |
93 |
1
1. crawl : mutated return of Object value for io/earcam/utilitarian/site/search/offline/Crawler::crawl to ( if (x != null) null else throw new RuntimeException ) → SURVIVED |
return crawl(pair.getKey(), pair.getValue()); |
94 | } | |
95 | ||
96 | ||
97 | private static Stream<Document> crawl(Path baseDir, URI baseUri) | |
98 | { | |
99 |
2
1. crawl : mutated return of Object value for io/earcam/utilitarian/site/search/offline/Crawler::crawl to ( if (x != null) null else throw new RuntimeException ) → SURVIVED 2. lambda$crawl$0 : mutated return of Object value for io/earcam/utilitarian/site/search/offline/Crawler::lambda$crawl$0 to ( if (x != null) null else throw new RuntimeException ) → KILLED |
return EmeticStream.emesis(Files::walk, baseDir) |
100 | .mapToStream() | |
101 |
1
1. lambda$crawl$1 : replaced return of integer sized value with (x == 0 ? 1 : 0) → SURVIVED |
.filter(Files::isRegularFile) |
102 |
1
1. lambda$crawl$2 : mutated return of Object value for io/earcam/utilitarian/site/search/offline/Crawler::lambda$crawl$2 to ( if (x != null) null else throw new RuntimeException ) → KILLED |
.map(f -> Document.document(baseDir, baseUri, f)); |
103 | } | |
104 | ||
105 | ||
106 | public Crawler filter(Predicate<Document> filter) | |
107 | { | |
108 | documents = documents.filter(filter); | |
109 |
1
1. filter : mutated return of Object value for io/earcam/utilitarian/site/search/offline/Crawler::filter to ( if (x != null) null else throw new RuntimeException ) → KILLED |
return this; |
110 | } | |
111 | ||
112 | ||
113 | public Crawler processor(Processor processor) | |
114 | { | |
115 | documents = documents.map(processor); | |
116 |
1
1. processor : mutated return of Object value for io/earcam/utilitarian/site/search/offline/Crawler::processor to ( if (x != null) null else throw new RuntimeException ) → KILLED |
return this; |
117 | } | |
118 | ||
119 | ||
120 | public Stream<Document> documents() | |
121 | { | |
122 |
1
1. documents : mutated return of Object value for io/earcam/utilitarian/site/search/offline/Crawler::documents to ( if (x != null) null else throw new RuntimeException ) → KILLED |
return documents.filter(Document::hasTokens); |
123 | } | |
124 | } | |
Mutations | ||
81 |
1.1 |
|
87 |
1.1 |
|
93 |
1.1 |
|
99 |
1.1 2.2 |
|
101 |
1.1 |
|
102 |
1.1 |
|
109 |
1.1 |
|
116 |
1.1 |
|
122 |
1.1 |