From 3988fbb4b8e635d2d26b97cf7572d517c21f93fe Mon Sep 17 00:00:00 2001 From: Arjan Seijkens Date: Fri, 3 Jul 2020 15:29:51 +0200 Subject: [PATCH 1/3] Created a unit test demonstrating that MetaModel is unable to work with a document indexed by Elasticsearch which contains dots in its fieldnames. Note that this is actually caused by Elasticsearch, because the mapping returned for such a document by Elasticsearch doesn't match the source returned by Elasticsearch when getting the source of a search hit (see https://github.com/elastic/elasticsearch-hadoop/issues/853 and related issues for more info on this). --- .../rest/ElasticSearchRestNestedDataIT.java | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/elasticsearch/rest/src/test/java/org/apache/metamodel/elasticsearch/rest/ElasticSearchRestNestedDataIT.java b/elasticsearch/rest/src/test/java/org/apache/metamodel/elasticsearch/rest/ElasticSearchRestNestedDataIT.java index 890cda77b..1265d619d 100644 --- a/elasticsearch/rest/src/test/java/org/apache/metamodel/elasticsearch/rest/ElasticSearchRestNestedDataIT.java +++ b/elasticsearch/rest/src/test/java/org/apache/metamodel/elasticsearch/rest/ElasticSearchRestNestedDataIT.java @@ -38,6 +38,7 @@ import org.elasticsearch.client.RequestOptions; import org.elasticsearch.client.RestHighLevelClient; import org.elasticsearch.client.indices.CreateIndexRequest; +import org.elasticsearch.common.xcontent.XContentType; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -111,4 +112,38 @@ public void testNestedData() throws Exception { assertEquals("Main street 1, Newville", userValueMap.get("address")); } } + + @Test + public void testIndexOfDocumentWithDots() throws Exception { + final String document = + "{ \"user.fullname\": \"John Doe\", " + + "\"user.address\": \"Main street 1, Newville\", " + + "\"message\": \"This is what I have to say.\" }"; + + final IndexRequest indexRequest = new IndexRequest(INDEX_NAME).id("1"); + 
indexRequest.source(document, XContentType.JSON); + + client.index(indexRequest, RequestOptions.DEFAULT); + + final Table table = dataContext.getDefaultSchema().getTableByName(DEFAULT_TABLE_NAME); + + assertThat(table.getColumnNames(), containsInAnyOrder("_id", "message", "user")); + + dataContext.refreshSchemas(); + + try (final DataSet dataSet = dataContext + .query() + .from(DEFAULT_TABLE_NAME) + .select("user") + .and("message") + .execute()) { + assertEquals(ElasticSearchRestDataSet.class, dataSet.getClass()); + + assertTrue(dataSet.next()); + final Row row = dataSet.getRow(); + assertEquals("This is what I have to say.", row.getValue(table.getColumnByName("message"))); + + assertNotNull(row.getValue(table.getColumnByName("user"))); + } + } } From 25f665b12abd1379b7f3f8fe258f043a4ac1e2d3 Mon Sep 17 00:00:00 2001 From: Arjan Seijkens Date: Mon, 6 Jul 2020 13:10:43 +0200 Subject: [PATCH 2/3] Introduced a workaround for the fact that Elasticsearch returns data based on indexed documents with dots in the field names in an inconsistent manner. I.e. the mapping it returns for such documents indicates that those kinds of fields are treated as nested objects. But if you get the source map of such a document from Elasticsearch, it still contains the original fieldnames (with dots in them). 
--- CHANGES.md | 1 + .../common/ElasticSearchUtils.java | 39 ++++++++++++ .../rest/ElasticSearchRestNestedDataIT.java | 63 +++++++++---------- 3 files changed, 70 insertions(+), 33 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 41fb5bde0..67fcf4cce 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,6 @@ ### Apache MetaModel [WIP] + * [METAMODEL-1228] - Better handling of fieldnames with dots in Elasticsearch * [METAMODEL-1227] - Better handling of nested objects in Elasticsearch data * [METAMODEL-1224] - Ensured compatibility with newer versions of PostgreSQL diff --git a/elasticsearch/common/src/main/java/org/apache/metamodel/elasticsearch/common/ElasticSearchUtils.java b/elasticsearch/common/src/main/java/org/apache/metamodel/elasticsearch/common/ElasticSearchUtils.java index 6ab4fa68c..57ca7a145 100644 --- a/elasticsearch/common/src/main/java/org/apache/metamodel/elasticsearch/common/ElasticSearchUtils.java +++ b/elasticsearch/common/src/main/java/org/apache/metamodel/elasticsearch/common/ElasticSearchUtils.java @@ -290,6 +290,23 @@ public static Row createRow(final Map sourceMap, final String do } else { values[i] = valueToDate; } + } else if (column.getType() == ColumnType.MAP && value == null) { + // Because of a bug in Elasticsearch, when field names contain dots, it's possible that the + // mapping of the index described a column to be of the type "MAP", while it's based on a number + // of fields contains dots in their name. In this case we may have to work around that + // inconsistency by creating column names with dots ourselves, based on the schema. 
+ final Map valueMap = new HashMap<>(); + + sourceMap + .keySet() + .stream() + .filter(fieldName -> fieldName.startsWith(column.getName() + ".")) + .forEach(fieldName -> evaluateField(sourceMap, valueMap, fieldName, fieldName + .substring(fieldName.indexOf('.') + 1))); + + if (!valueMap.isEmpty()) { + values[i] = valueMap; + } } else { values[i] = value; } @@ -299,4 +316,26 @@ public static Row createRow(final Map sourceMap, final String do return new DefaultRow(header, values); } + + private static void evaluateField(final Map sourceMap, final Map valueMap, + final String sourceFieldName, final String subFieldName) { + if (subFieldName.contains(".")) { + @SuppressWarnings("unchecked") + final Map nestedValueMap = (Map) valueMap + .computeIfAbsent(subFieldName.substring(0, subFieldName.indexOf('.')), key -> createNestedValueMap( + valueMap, key)); + + evaluateField(sourceMap, nestedValueMap, sourceFieldName, subFieldName + .substring(subFieldName.indexOf('.') + 1)); + } else { + valueMap.put(subFieldName, sourceMap.get(sourceFieldName)); + } + } + + private static Object createNestedValueMap(final Map valueMap, final String nestedFieldName) { + final Map nestedValueMap = new HashMap<>(); + valueMap.put(nestedFieldName, nestedValueMap); + + return nestedValueMap; + } } diff --git a/elasticsearch/rest/src/test/java/org/apache/metamodel/elasticsearch/rest/ElasticSearchRestNestedDataIT.java b/elasticsearch/rest/src/test/java/org/apache/metamodel/elasticsearch/rest/ElasticSearchRestNestedDataIT.java index 1265d619d..25b06220d 100644 --- a/elasticsearch/rest/src/test/java/org/apache/metamodel/elasticsearch/rest/ElasticSearchRestNestedDataIT.java +++ b/elasticsearch/rest/src/test/java/org/apache/metamodel/elasticsearch/rest/ElasticSearchRestNestedDataIT.java @@ -69,9 +69,13 @@ public void tearDown() throws IOException { @Test public void testNestedData() throws Exception { + final Map address = new HashMap<>(); + address.put("street", "Main street 1"); + 
address.put("city", "Newville"); + final Map user = new HashMap<>(); user.put("fullname", "John Doe"); - user.put("address", "Main street 1, Newville"); + user.put("address", address); final Map userMessage = new LinkedHashMap<>(); userMessage.put("user", user); @@ -82,42 +86,15 @@ public void testNestedData() throws Exception { client.index(indexRequest, RequestOptions.DEFAULT); - final Table table = dataContext.getDefaultSchema().getTableByName(DEFAULT_TABLE_NAME); - - assertThat(table.getColumnNames(), containsInAnyOrder("_id", "message", "user")); - - assertEquals(ColumnType.STRING, table.getColumnByName("message").getType()); - assertEquals(ColumnType.MAP, table.getColumnByName("user").getType()); - - dataContext.refreshSchemas(); - - try (final DataSet dataSet = dataContext - .query() - .from(DEFAULT_TABLE_NAME) - .select("user") - .and("message") - .execute()) { - assertEquals(ElasticSearchRestDataSet.class, dataSet.getClass()); - - assertTrue(dataSet.next()); - final Row row = dataSet.getRow(); - assertEquals("This is what I have to say.", row.getValue(table.getColumnByName("message"))); - - final Object userValue = row.getValue(table.getColumnByName("user")); - assertTrue(userValue instanceof Map); - - @SuppressWarnings("rawtypes") - final Map userValueMap = (Map) userValue; - assertEquals("John Doe", userValueMap.get("fullname")); - assertEquals("Main street 1, Newville", userValueMap.get("address")); - } + validateSchemaAndResults(); } @Test public void testIndexOfDocumentWithDots() throws Exception { final String document = "{ \"user.fullname\": \"John Doe\", " - + "\"user.address\": \"Main street 1, Newville\", " + + "\"user.address.street\": \"Main street 1\", " + + "\"user.address.city\": \"Newville\", " + "\"message\": \"This is what I have to say.\" }"; final IndexRequest indexRequest = new IndexRequest(INDEX_NAME).id("1"); @@ -125,12 +102,19 @@ public void testIndexOfDocumentWithDots() throws Exception { client.index(indexRequest, 
RequestOptions.DEFAULT); + validateSchemaAndResults(); + } + + private void validateSchemaAndResults() { final Table table = dataContext.getDefaultSchema().getTableByName(DEFAULT_TABLE_NAME); assertThat(table.getColumnNames(), containsInAnyOrder("_id", "message", "user")); - dataContext.refreshSchemas(); + assertEquals(ColumnType.STRING, table.getColumnByName("message").getType()); + assertEquals(ColumnType.MAP, table.getColumnByName("user").getType()); + dataContext.refreshSchemas(); + try (final DataSet dataSet = dataContext .query() .from(DEFAULT_TABLE_NAME) @@ -142,8 +126,21 @@ public void testIndexOfDocumentWithDots() throws Exception { assertTrue(dataSet.next()); final Row row = dataSet.getRow(); assertEquals("This is what I have to say.", row.getValue(table.getColumnByName("message"))); + + final Object userValue = row.getValue(table.getColumnByName("user")); + assertTrue(userValue instanceof Map); - assertNotNull(row.getValue(table.getColumnByName("user"))); + @SuppressWarnings("rawtypes") + final Map userValueMap = (Map) userValue; + assertEquals("John Doe", userValueMap.get("fullname")); + + final Object addressValue = userValueMap.get("address"); + assertTrue(addressValue instanceof Map); + + @SuppressWarnings("rawtypes") + final Map addressValueMap = (Map) addressValue; + assertEquals("Main street 1", addressValueMap.get("street")); + assertEquals("Newville", addressValueMap.get("city")); } } } From 3ae1d7c774dd3eb8ef77d151f10cad540a603772 Mon Sep 17 00:00:00 2001 From: Arjan Seijkens Date: Tue, 7 Jul 2020 07:57:58 +0200 Subject: [PATCH 3/3] Fixed typo. 
--- .../metamodel/elasticsearch/common/ElasticSearchUtils.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/elasticsearch/common/src/main/java/org/apache/metamodel/elasticsearch/common/ElasticSearchUtils.java b/elasticsearch/common/src/main/java/org/apache/metamodel/elasticsearch/common/ElasticSearchUtils.java index 57ca7a145..f5c70e86b 100644 --- a/elasticsearch/common/src/main/java/org/apache/metamodel/elasticsearch/common/ElasticSearchUtils.java +++ b/elasticsearch/common/src/main/java/org/apache/metamodel/elasticsearch/common/ElasticSearchUtils.java @@ -293,7 +293,7 @@ public static Row createRow(final Map sourceMap, final String do } else if (column.getType() == ColumnType.MAP && value == null) { // Because of a bug in Elasticsearch, when field names contain dots, it's possible that the // mapping of the index described a column to be of the type "MAP", while it's based on a number - // of fields contains dots in their name. In this case we may have to work around that + // of fields containing dots in their name. In this case we may have to work around that // inconsistency by creating column names with dots ourselves, based on the schema. final Map valueMap = new HashMap<>();