
Commit cc8cae3

Merge pull request #62 from icatproject/61_location_analyzers
Expand location into location, location.exact, location.fileName
2 parents 1d2fcaa + 7b35542

File tree: 8 files changed, +249 -114 lines changed
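In short, a Datafile location is now indexed three ways: location (the path, split on "/" judging by the separator passed to IcatSeparatorAnalyzer), location.exact (the full path kept intact for exact matching) and location.fileName (the name after the last "/", split on "."). The snippet below is not part of the commit; it is a minimal way of inspecting what an analyzer emits for a field value when checking the behaviour described in the diffs that follow. The class name, the dumpTokens helper and the example path are illustrative only.

```java
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class LocationTokenCheck {

    // Prints every token an analyzer produces for a field value. Handy for checking
    // what "location", "location.exact" and "location.fileName" will actually index.
    static void dumpTokens(Analyzer analyzer, String field, String value) throws IOException {
        try (TokenStream stream = analyzer.tokenStream(field, value)) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(field + " -> " + term.toString());
            }
            stream.end();
        }
    }

    public static void main(String[] args) throws IOException {
        // KeywordAnalyzer is what DocumentMapping uses for "location.exact" at query time:
        // the whole (hypothetical) path comes out as a single token.
        dumpTokens(new KeywordAnalyzer(), "location.exact", "/instrument/cycle/run_42.nxs");
    }
}
```

Running this prints a single token containing the whole path, which is the behaviour an exact-match query relies on.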

src/main/java/org/icatproject/lucene/DocumentMapping.java

Lines changed: 93 additions & 99 deletions
@@ -1,15 +1,17 @@
 package org.icatproject.lucene;

-import java.util.Arrays;
 import java.util.HashMap;
-import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;

 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.core.KeywordAnalyzer;
+import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
 import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser;
 import org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler;
 import org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler.ConfigurationKeys;
+import org.icatproject.lucene.analyzers.IcatSeparatorAnalyzer;
+import org.icatproject.lucene.analyzers.IcatSynonymAnalyzer;

 public class DocumentMapping {

@@ -26,121 +28,113 @@ public static class ParentRelationship {
          * @param parentName    Name of the parent entity.
          * @param joiningField  Field that joins the child to its parent.
          * @param cascadeDelete If the child is deleted, whether the parent onto which
-         *                      it is nested should be deleted wholesale or just have
-         *                      its fields pruned.
-         * @param fields        Fields that should be updated by this relationship where
-         *                      the field is the same on parent and child.
+         *                      it is nested should be deleted wholesale or just have
+         *                      its fields pruned.
+         * @param fieldMapping  Fields that should be updated by this relationship. The
+         *                      key and value will be the same for most fields, but for
+         *                      some they will differ to allow fields to be flattened
+         *                      across entities (e.g. dataset.name: name).
          */
-        public ParentRelationship(String parentName, String joiningField, boolean cascadeDelete, String... fields) {
+        public ParentRelationship(String parentName, String joiningField, boolean cascadeDelete,
+                Map<String, String> fieldMapping) {
             this.parentName = parentName;
             this.joiningField = joiningField;
             this.cascadeDelete = cascadeDelete;
-            fieldMapping = new HashMap<>();
-            for (String field : fields) {
-                fieldMapping.put(field, field);
-            }
-        }
-
-        /**
-         * @param parentField Name on the parent, such as "dataset.name"
-         * @param childField  Name on the child, such as "name"
-         */
-        public void mapField(String parentField, String childField) {
-            fieldMapping.put(parentField, childField);
+            this.fieldMapping = fieldMapping;
         }
     }

-    private static Analyzer analyzer = new IcatSynonymAnalyzer();;
-
-    public static final Set<String> doubleFields = new HashSet<>();
-    public static final Set<String> longFields = new HashSet<>();
-    public static final Set<String> sortFields = new HashSet<>();
-    public static final Set<String> textFields = new HashSet<>();
-    public static final Set<String> indexedEntities = new HashSet<>();
-    public static final Map<String, ParentRelationship[]> relationships = new HashMap<>();
+    public static final Set<String> doubleFields = Set.of("numericValue", "numericValueSI", "rangeTop", "rangeTopSI",
+            "rangeBottom", "rangeBottomSI");
+    public static final Set<String> longFields = Set.of("date", "startDate", "endDate", "dateTimeValue",
+            "investigation.startDate", "fileSize", "fileCount", "datafile.id", "datafileFormat.id", "dataset.id",
+            "facility.id", "facilityCycle.id", "investigation.id", "instrument.id", "id", "sample.id",
+            "sample.investigation.id", "sample.type.id", "technique.id", "type.id", "user.id");
+    public static final Set<String> sortFields = Set.of("datafile.id", "datafileFormat.id", "dataset.id", "facility.id",
+            "facilityCycle.id", "investigation.id", "instrument.id", "id", "sample.id", "sample.investigation.id",
+            "technique.id", "type.id", "user.id", "date", "name", "stringValue", "dateTimeValue", "numericValue",
+            "numericValueSI", "fileSize", "fileCount");
+    public static final Set<String> textFields = Set.of("name", "visitId", "description", "dataset.name",
+            "investigation.name", "instrument.name", "instrument.fullName", "datafileFormat.name", "sample.name",
+            "sample.type.name", "technique.name", "technique.description", "technique.pid", "title", "summary",
+            "facility.name", "user.fullName", "type.name", "doi");
+    public static final Set<String> pathFields = Set.of("location");
+    public static final Set<String> indexedEntities = Set.of("Datafile", "Dataset", "Investigation",
+            "DatafileParameter", "DatasetParameter", "DatasetTechnique", "InstrumentScientist",
+            "InvestigationFacilityCycle", "InvestigationInstrument", "InvestigationParameter", "InvestigationUser",
+            "Sample", "SampleParameter");
+    public static final Map<String, ParentRelationship[]> relationships = Map.ofEntries(
+            Map.entry("Instrument", new ParentRelationship[] {
+                    new ParentRelationship("InvestigationInstrument", "instrument.id", true,
+                            Map.of("instrument.name", "instrument.name", "instrument.fullName", "instrument.fullName")) }),
+            Map.entry("User", new ParentRelationship[] {
+                    new ParentRelationship("InvestigationUser", "user.id", true,
+                            Map.of("user.name", "user.name", "user.fullName", "user.fullName")),
+                    new ParentRelationship("InstrumentScientist", "user.id", true,
+                            Map.of("user.name", "user.name", "user.fullName", "user.fullName")) }),
+            Map.entry("Sample", new ParentRelationship[] {
+                    new ParentRelationship("Dataset", "sample.id", false,
+                            Map.of("sample.name", "sample.name", "sample.investigation.id", "sample.investigation.id")),
+                    new ParentRelationship("Datafile", "sample.id", false,
+                            Map.of("sample.name", "sample.name", "sample.investigation.id", "sample.investigation.id")) }),
+            Map.entry("SampleType", new ParentRelationship[] {
+                    new ParentRelationship("Sample", "type.id", true, Map.of("type.name", "type.name")),
+                    new ParentRelationship("Dataset", "sample.type.id", false,
+                            Map.of("sample.type.name", "sample.type.name")),
+                    new ParentRelationship("Datafile", "sample.type.id", false,
+                            Map.of("sample.type.name", "sample.type.name")) }),
+            Map.entry("InvestigationType", new ParentRelationship[] {
+                    new ParentRelationship("Investigation", "type.id", true, Map.of("type.name", "type.name")) }),
+            Map.entry("DatasetType", new ParentRelationship[] {
+                    new ParentRelationship("Dataset", "type.id", true, Map.of("type.name", "type.name")) }),
+            Map.entry("DatafileFormat", new ParentRelationship[] {
+                    new ParentRelationship("Datafile", "datafileFormat.id", false,
+                            Map.of("datafileFormat.name", "datafileFormat.name")) }),
+            Map.entry("Facility", new ParentRelationship[] {
+                    new ParentRelationship("Investigation", "facility.id", true,
+                            Map.of("facility.name", "facility.name")) }),
+            Map.entry("ParameterType", new ParentRelationship[] {
+                    new ParentRelationship("DatafileParameter", "type.id", true,
+                            Map.of("type.name", "type.name", "type.units", "type.units")),
+                    new ParentRelationship("DatasetParameter", "type.id", true,
+                            Map.of("type.name", "type.name", "type.units", "type.units")),
+                    new ParentRelationship("InvestigationParameter", "type.id", true,
+                            Map.of("type.name", "type.name", "type.units", "type.units")),
+                    new ParentRelationship("SampleParameter", "type.id", true,
+                            Map.of("type.name", "type.name", "type.units", "type.units")) }),
+            Map.entry("Technique", new ParentRelationship[] {
+                    new ParentRelationship("DatasetTechnique", "technique.id", true,
+                            Map.of("technique.name", "technique.name", "technique.description", "technique.description",
+                                    "technique.pid", "technique.pid")) }),
+            Map.entry("Investigation", new ParentRelationship[] {
+                    new ParentRelationship("Dataset", "investigation.id", true,
+                            Map.of("visitId", "visitId", "investigation.name", "name", "investigation.title", "title",
+                                    "investigation.startDate", "startDate")),
+                    new ParentRelationship("Datafile", "investigation.id", true,
+                            Map.of("visitId", "visitId", "investigation.name", "name")) }),
+            Map.entry("Dataset", new ParentRelationship[] {
+                    new ParentRelationship("Datafile", "dataset.id", true, Map.of("dataset.name", "name")) }));

     public static final StandardQueryParser genericParser = buildParser();
     public static final StandardQueryParser datafileParser = buildParser("name", "description", "location",
-            "datafileFormat.name", "visitId", "sample.name", "sample.type.name", "doi");
+            "location.fileName", "datafileFormat.name", "visitId", "sample.name", "sample.type.name", "doi");
     public static final StandardQueryParser datasetParser = buildParser("name", "description", "sample.name",
             "sample.type.name", "type.name", "visitId", "doi");
     public static final StandardQueryParser investigationParser = buildParser("name", "visitId", "title", "summary",
             "facility.name", "type.name", "doi");
     public static final StandardQueryParser sampleParser = buildParser("sample.name", "sample.type.name");

-    static {
-        doubleFields.addAll(Arrays.asList("numericValue", "numericValueSI", "rangeTop", "rangeTopSI", "rangeBottom",
-                "rangeBottomSI"));
-        longFields.addAll(
-                Arrays.asList("date", "startDate", "endDate", "dateTimeValue", "investigation.startDate", "fileSize",
-                        "fileCount", "datafile.id", "datafileFormat.id", "dataset.id", "facility.id",
-                        "facilityCycle.id", "investigation.id", "instrument.id", "id", "sample.id",
-                        "sample.investigation.id", "sample.type.id", "technique.id", "type.id", "user.id"));
-        sortFields.addAll(
-                Arrays.asList("datafile.id", "datafileFormat.id", "dataset.id", "facility.id", "facilityCycle.id",
-                        "investigation.id", "instrument.id", "id", "sample.id", "sample.investigation.id",
-                        "technique.id", "type.id", "user.id", "date", "name", "stringValue", "dateTimeValue",
-                        "numericValue", "numericValueSI", "fileSize", "fileCount"));
-        textFields.addAll(Arrays.asList("name", "visitId", "description", "location", "dataset.name",
-                "investigation.name", "instrument.name", "instrument.fullName", "datafileFormat.name", "sample.name",
-                "sample.type.name", "technique.name", "technique.description", "technique.pid", "title", "summary",
-                "facility.name", "user.fullName", "type.name", "doi"));
-
-        indexedEntities.addAll(Arrays.asList("Datafile", "Dataset", "Investigation", "DatafileParameter",
-                "DatasetParameter", "DatasetTechnique", "InstrumentScientist", "InvestigationFacilityCycle",
-                "InvestigationInstrument", "InvestigationParameter", "InvestigationUser", "Sample", "SampleParameter"));
-
-        relationships.put("Instrument", new ParentRelationship[] {
-                new ParentRelationship("InvestigationInstrument", "instrument.id", true, "instrument.name",
-                        "instrument.fullName") });
-        relationships.put("User", new ParentRelationship[] {
-                new ParentRelationship("InvestigationUser", "user.id", true, "user.name", "user.fullName"),
-                new ParentRelationship("InstrumentScientist", "user.id", true, "user.name", "user.fullName") });
-        relationships.put("Sample", new ParentRelationship[] {
-                new ParentRelationship("Dataset", "sample.id", false, "sample.name", "sample.investigation.id"),
-                new ParentRelationship("Datafile", "sample.id", false, "sample.name", "sample.investigation.id") });
-        relationships.put("SampleType", new ParentRelationship[] {
-                new ParentRelationship("Sample", "type.id", true, "type.name"),
-                new ParentRelationship("Dataset", "sample.type.id", false, "sample.type.name"),
-                new ParentRelationship("Datafile", "sample.type.id", false, "sample.type.name") });
-        relationships.put("InvestigationType",
-                new ParentRelationship[] { new ParentRelationship("Investigation", "type.id", true, "type.name") });
-        relationships.put("DatasetType",
-                new ParentRelationship[] { new ParentRelationship("Dataset", "type.id", true, "type.name") });
-        relationships.put("DatafileFormat",
-                new ParentRelationship[] {
-                        new ParentRelationship("Datafile", "datafileFormat.id", false, "datafileFormat.name") });
-        relationships.put("Facility",
-                new ParentRelationship[] { new ParentRelationship("Investigation", "facility.id", true, "facility.name") });
-        relationships.put("ParameterType",
-                new ParentRelationship[] {
-                        new ParentRelationship("DatafileParameter", "type.id", true, "type.name", "type.units"),
-                        new ParentRelationship("DatasetParameter", "type.id", true,"type.name", "type.units"),
-                        new ParentRelationship("InvestigationParameter", "type.id", true, "type.name", "type.units"),
-                        new ParentRelationship("SampleParameter", "type.id", true, "type.name", "type.units") });
-        relationships.put("Technique",
-                new ParentRelationship[] { new ParentRelationship("DatasetTechnique", "technique.id", true,"technique.name",
-                        "technique.description", "technique.pid") });
-
-        ParentRelationship investigationDatasetRelationship = new ParentRelationship("Dataset", "investigation.id",
-                true, "visitId");
-        investigationDatasetRelationship.mapField("investigation.name", "name");
-        investigationDatasetRelationship.mapField("investigation.title", "title");
-        investigationDatasetRelationship.mapField("investigation.startDate", "startDate");
-        ParentRelationship investigationDatafileRelationship = new ParentRelationship("Datafile", "investigation.id",
-                true,"visitId");
-        investigationDatafileRelationship.mapField("investigation.name", "name");
-        relationships.put("Investigation", new ParentRelationship[] {investigationDatasetRelationship, investigationDatafileRelationship });
-
-        ParentRelationship datasetDatafileRelationship = new ParentRelationship("Datafile", "dataset.id", true);
-        datasetDatafileRelationship.mapField("dataset.name", "name");
-        relationships.put("Dataset", new ParentRelationship[] { datasetDatafileRelationship });
-    }
-
     private static StandardQueryParser buildParser(String... defaultFields) {
-        StandardQueryParser parser = new StandardQueryParser();
+        HashMap<String, Analyzer> analyzerMap = new HashMap<>();
+        for (String pathField : pathFields) {
+            analyzerMap.put(pathField, new IcatSeparatorAnalyzer("/"));
+            analyzerMap.put(pathField + ".exact", new KeywordAnalyzer());
+            analyzerMap.put(pathField + ".fileName", new IcatSeparatorAnalyzer("."));
+        }
+        PerFieldAnalyzerWrapper analyzerWrapper = new PerFieldAnalyzerWrapper(new IcatSynonymAnalyzer(), analyzerMap);
+        StandardQueryParser parser = new StandardQueryParser(analyzerWrapper);
+
         StandardQueryConfigHandler qpConf = (StandardQueryConfigHandler) parser.getQueryConfigHandler();
-        qpConf.set(ConfigurationKeys.ANALYZER, analyzer);
         qpConf.set(ConfigurationKeys.ALLOW_LEADING_WILDCARD, true);
         if (defaultFields.length > 0) {
            qpConf.set(ConfigurationKeys.MULTI_FIELDS, defaultFields);
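On the query side, buildParser now hands StandardQueryParser a PerFieldAnalyzerWrapper, so each location variant is analysed with its own analyzer while every other field falls back to IcatSynonymAnalyzer. The sketch below shows that wiring with stock Lucene classes only; StandardAnalyzer stands in for IcatSynonymAnalyzer, and the IcatSeparatorAnalyzer entries are left out because that class is defined outside this diff, so only the location.exact routing is shown.

```java
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.flexible.core.QueryNodeException;
import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser;
import org.apache.lucene.search.Query;

public class PerFieldParserSketch {
    public static void main(String[] args) throws QueryNodeException {
        // Route "location.exact" to KeywordAnalyzer, as buildParser does; every other
        // field falls back to the default analyzer (StandardAnalyzer as a stand-in).
        Map<String, Analyzer> analyzerMap = new HashMap<>();
        analyzerMap.put("location.exact", new KeywordAnalyzer());
        Analyzer wrapper = new PerFieldAnalyzerWrapper(new StandardAnalyzer(), analyzerMap);

        StandardQueryParser parser = new StandardQueryParser(wrapper);

        // The keyword-analyzed field keeps the whole quoted path as one term, so this
        // should come out as a single-term query rather than per-word terms.
        Query exact = parser.parse("\"/instrument/cycle/run_42.nxs\"", "location.exact");
        Query text = parser.parse("run_42.nxs", "description");
        System.out.println(exact);
        System.out.println(text);
    }
}
```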

src/main/java/org/icatproject/lucene/Field.java

Lines changed: 21 additions & 1 deletion
@@ -22,7 +22,7 @@
  * Wrapper for the name, value and type (String/Text, long, double) of a field
  * to be added to a Lucene Document.
  */
-class Field {
+public class Field {

     private abstract class InnerField {

@@ -58,6 +58,13 @@ public void addToDocument(Document document) throws NumberFormatException {

             if (DocumentMapping.textFields.contains(name)) {
                 document.add(new TextField(name, value, Store.YES));
+            } else if (DocumentMapping.pathFields.contains(name)) {
+                document.add(new TextField(name, value, Store.YES));
+                document.add(new TextField(name + ".exact", value, Store.NO));
+                int index = value.lastIndexOf("/");
+                if (index != -1) {
+                    document.add(new TextField(name + ".fileName", value.substring(index + 1), Store.NO));
+                }
             } else {
                 document.add(new StringField(name, value, Store.YES));
             }
@@ -139,6 +146,19 @@ public Field(JsonObject object, String key, String name, List<String> facetField
         }
     }

+    /**
+     * Creates a wrapper for a String Field.
+     *
+     * @param name        Name of the field to be used on the Document
+     * @param value       String value of the field
+     * @param facetFields List of String field names which should be stored as a facetable keyword
+     */
+    public Field(String name, String value, List<String> facetFields) {
+        this.name = name;
+        facetable = facetFields.contains(name);
+        innerField = new InnerStringField(value);
+    }
+
     /**
      * Creates a wrapper for a Field.
      *
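On the indexing side, the new branch in Field.addToDocument stores the path once under its own name and adds two unstored variants, with .fileName only receiving the text after the final "/". The sketch below reproduces that branching as a standalone snippet using stock Lucene document classes; the addLocation helper and the example value are illustrative, not part of the commit.

```java
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;

public class PathFieldSketch {

    // Mirrors the branch added to Field.addToDocument: the stored "location" plus two
    // unstored variants, where ".fileName" only gets the part after the last "/".
    static void addLocation(Document document, String value) {
        document.add(new TextField("location", value, Store.YES));
        document.add(new TextField("location.exact", value, Store.NO));
        int index = value.lastIndexOf("/");
        if (index != -1) {
            document.add(new TextField("location.fileName", value.substring(index + 1), Store.NO));
        }
    }

    public static void main(String[] args) {
        Document document = new Document();
        addLocation(document, "/instrument/2023/run_42.nxs");
        document.forEach(f -> System.out.println(f.name() + " -> " + f.stringValue()));
    }
}
```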

src/main/java/org/icatproject/lucene/Lucene.java

Lines changed: 23 additions & 3 deletions
@@ -45,6 +45,11 @@
 import jakarta.ws.rs.core.Context;
 import jakarta.ws.rs.core.MediaType;

+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.custom.CustomAnalyzer;
+import org.apache.lucene.analysis.custom.CustomAnalyzer.Builder;
+import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
+import org.apache.lucene.analysis.path.PathHierarchyTokenizerFactory;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.DoublePoint;
 import org.apache.lucene.document.Field.Store;
@@ -88,6 +93,8 @@
 import org.apache.lucene.util.NumericUtils;
 import org.icatproject.lucene.DocumentMapping.ParentRelationship;
 import org.icatproject.lucene.SearchBucket.SearchType;
+import org.icatproject.lucene.analyzers.IcatSeparatorAnalyzer;
+import org.icatproject.lucene.analyzers.IcatSynonymAnalyzer;
 import org.icatproject.lucene.exceptions.LuceneException;
 import org.icatproject.utils.CheckedProperties;
 import org.icatproject.utils.IcatUnits;
@@ -125,7 +132,7 @@ public ShardBucket(java.nio.file.Path shardPath) throws IOException {
             AsyncFSLockFactory lockFactory = AsyncFSLockFactory.INSTANCE;
             directory = new RAFDirectory(shardPath, lockFactory);
             logger.info("RAFDirectory opened for {}", shardPath);
-            IndexWriterConfig config = new IndexWriterConfig(analyzer);
+            IndexWriterConfig config = new IndexWriterConfig(analyzerWrapper);
             indexWriter = new IndexWriter(directory, config);
             String[] files = directory.listAll();
             if (files.length == 1 && files[0].equals("write.lock")) {
@@ -201,7 +208,7 @@ private void initState(IndexSearcher indexSearcher) throws IOException {
         */
        public void ensureOpen() throws IOException, LuceneException {
            if (!indexWriter.isOpen()) {
-               IndexWriterConfig config = new IndexWriterConfig(analyzer);
+               IndexWriterConfig config = new IndexWriterConfig(analyzerWrapper);
                indexWriter = new IndexWriter(directory, config);
                searcherManager = new SearcherManager(indexWriter, null);
                IndexSearcher indexSearcher = searcherManager.acquire();
@@ -426,7 +433,20 @@ public void releaseSearchers(List<IndexSearcher> subSearchers) throws IOExceptio

    static final Logger logger = LoggerFactory.getLogger(Lucene.class);
    private static final Marker fatal = MarkerFactory.getMarker("FATAL");
-   private static final IcatSynonymAnalyzer analyzer = new IcatSynonymAnalyzer();
+   private static final HashMap<String, Analyzer> analyzerMap = new HashMap<>();
+   static {
+       for (String pathField : DocumentMapping.pathFields) {
+           analyzerMap.put(pathField, new IcatSeparatorAnalyzer("/"));
+           try {
+               Builder builder = CustomAnalyzer.builder().withTokenizer(PathHierarchyTokenizerFactory.class);
+               analyzerMap.put(pathField + ".exact", builder.build());
+           } catch (IOException e) {
+               logger.error("Could not initialize path hierarchy analyzer", e);
+           }
+           analyzerMap.put(pathField + ".fileName", new IcatSeparatorAnalyzer("."));
+       }
+   }
+   public static PerFieldAnalyzerWrapper analyzerWrapper = new PerFieldAnalyzerWrapper(new IcatSynonymAnalyzer(), analyzerMap);

    private final FacetsConfig facetsConfig = new FacetsConfig();
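Note the asymmetry for the .exact variant: at index time it is analysed with a CustomAnalyzer built on PathHierarchyTokenizerFactory, which emits the full path and every ancestor prefix, while at query time DocumentMapping routes the same field to a KeywordAnalyzer, so an exact path query matches the longest of those tokens. A sketch of the index-time construction and its output, assuming a hypothetical path:

```java
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.path.PathHierarchyTokenizerFactory;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PathHierarchySketch {
    public static void main(String[] args) throws IOException {
        // Same construction as the static block above; the path below is hypothetical.
        Analyzer exact = CustomAnalyzer.builder()
                .withTokenizer(PathHierarchyTokenizerFactory.class)
                .build();
        try (TokenStream stream = exact.tokenStream("location.exact", "/dir1/dir2/run_42.nxs")) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                // Prints /dir1, then /dir1/dir2, then /dir1/dir2/run_42.nxs
                System.out.println(term.toString());
            }
            stream.end();
        }
    }
}
```

A query for a parent directory such as /dir1 should also match, since that prefix is indexed as its own token.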
