Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ML] adds new feature_processors field for data frame analytics #60528

Merged
merged 15 commits into from
Aug 14, 2020
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ public Tuple<ExtractedFields, List<FieldSelection>> detect() {
ExtractedFields extractedFields = detectExtractedFields(fields, fieldSelection, processedFields);
addIncludedFields(extractedFields, fieldSelection);

checkOutputFeatureUniqueness(processedFields, fields);

return Tuple.tuple(extractedFields, Collections.unmodifiableList(new ArrayList<>(fieldSelection)));
}

Expand Down Expand Up @@ -525,6 +527,37 @@ private void addIncludedFields(ExtractedFields extractedFields, Set<FieldSelecti
}
}

static void checkOutputFeatureUniqueness(List<ProcessedField> processedFields, Set<String> selectedFields) {
Set<String> processInputs = processedFields.stream()
.map(ProcessedField::getInputFieldNames)
.flatMap(List::stream)
.collect(Collectors.toSet());
// All analysis fields that we include that are NOT processed
// This indicates that they are sent as is
Set<String> organicFields = Sets.difference(selectedFields, processInputs);

Set<String> processedFeatures = new HashSet<>();
Set<String> duplicatedFields = new HashSet<>();
for (ProcessedField processedField : processedFields) {
for (String output : processedField.getOutputFieldNames()) {
if(processedFeatures.add(output) == false) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: space after if

duplicatedFields.add(output);
}
}
}
if (duplicatedFields.isEmpty() == false) {
throw ExceptionsHelper.badRequestException(
"feature_processors must define unique output field names; duplicate fields {}",
duplicatedFields);
}
Set<String> duplicateOrganicAndProcessed = Sets.intersection(organicFields, processedFeatures);
if (duplicateOrganicAndProcessed.isEmpty() == false) {
throw ExceptionsHelper.badRequestException(
"feature_processors output fields must not include non-processed analysis fields; duplicate fields {}",
duplicateOrganicAndProcessed);
}
}

static Set<String> getCategoricalInputFields(ExtractedFields extractedFields, DataFrameAnalysis analysis) {
return extractedFields.getAllFields().stream()
.filter(extractedField -> analysis.getAllowedCategoricalTypes(extractedField.getName())
Expand Down