From 2f261e5f60c00b89c2449001c494b13254f90273 Mon Sep 17 00:00:00 2001 From: Rich Megginson Date: Fri, 8 Feb 2019 14:54:21 -0700 Subject: [PATCH] undefined_to_string - convert unknown fields to string value --- README.md | 69 ++++++++++++++++++++- lib/fluent/plugin/filter_viaq_data_model.rb | 11 ++++ test/test_filter_viaq_data_model.rb | 18 ++++++ 3 files changed, 97 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b27bb4c..68ef2fc 100644 --- a/README.md +++ b/README.md @@ -63,6 +63,9 @@ See `filter-viaq_data_model.conf` for an example filter configuration. * `default_keep_fields` - comma delimited string - default: `''` * This is the default list of fields to keep as top level fields in the record * `default_keep_fields message,@timestamp,ident` - do not move these fields into the `undefined` field + * The default list of fields comes from the list of top level fields defined in the + ViaQ https://github.com/ViaQ/elasticsearch-templates - see below for an example of how to extract + those fields to set the default value for `default_keep_fields` * `extra_keep_fields` - comma delimited string - default: `''` * This is an extra list of fields to keep in addition to `default_keep_fields` - mostly useful as a way to hard code the @@ -80,6 +83,8 @@ See `filter-viaq_data_model.conf` for an example filter configuration. * `undefined_name` - string - default `"undefined"` * Name of undefined top level field to use if `use_undefined true` is set * `undefined_name myfields` - keep undefined fields under field `myfields` +* `undefined_to_string` - boolean - default `false` + * normalize undefined values to be string valued - see below * `rename_time` - boolean - default `true` * Rename the time field e.g. when you need to set `@timestamp` in the record * NOTE: This will overwrite the `dest_time_name` if already set @@ -145,7 +150,69 @@ See `filter-viaq_data_model.conf` for an example filter configuration. in the file. 
This means, don't use `tag "**"` as the first formatter or none of your others will be matched or evaulated. -## Example +## How to get fields for `default_keep_fields` + +If you have https://github.com/ViaQ/elasticsearch-templates cloned locally in +`../elasticsearch-templates`: + + python -c 'import sys,yaml + uniquefields = {} + for ff in sys.argv[1:]: + hsh = yaml.load(open(ff)) + print hsh + if 0 < ff.find("_default_.yml"): + # default is a special case + for ent in hsh["_default_"]["fields"]: + fieldname = ent["name"] + uniquefields[fieldname] = fieldname + else: + fieldname = hsh.get("namespace") + if fieldname: + fieldname = hsh["namespace"]["name"] + uniquefields[fieldname] = fieldname + else: + fieldname = hsh.keys()[0] + uniquefields[fieldname] = fieldname + print ",".join(sorted(uniquefields.keys())) + ' $( find ../elasticsearch-templates/namespaces -name \*.yml ) + +## `undefined_to_string` +One of the problems with storing data in Elasticsearch is that it really +requires you to have strict control over the fields and the number of fields +being stored. You typically have to define a strict input pipeline for +formatting the data, and define index templates to specify the type of data. +If you are dealing with unstructured data, you run into the risk that you have +a field named `fieldname` which in some records has a `string` value, but in +other documents may have an `int` value or a value of some other data type. +To mitigate this situation, the viaq plugin will convert unknown fields to their +JSON string representation. 
For example, if you have the following configuration: + + undefined_to_string true + +and you get a record that looks like this: + + { + "message":"my message", + "stringfield":"this is a string", + "status":404, + "compositefield":{"a":"b"}, + "anarray":[1, 2, 3] + } + +The end result would look like this: + + { + "message":"my message", + "stringfield":"this is a string", + "status":"404", + "compositefield":"{\"a\":\"b\"}", + "anarray":"[1,2,3]" + } + +That is, the value of any unknown fields will be converted to their JSON string +representation. + +## Example - default values - undefined_to_string false If the input record looks like this: diff --git a/lib/fluent/plugin/filter_viaq_data_model.rb b/lib/fluent/plugin/filter_viaq_data_model.rb index 5f5185d..24687d7 100644 --- a/lib/fluent/plugin/filter_viaq_data_model.rb +++ b/lib/fluent/plugin/filter_viaq_data_model.rb @@ -17,6 +17,7 @@ # require 'time' require 'date' +require 'json' require 'fluent/filter' require 'fluent/log' @@ -76,6 +77,9 @@ class ViaqDataModelFilter < Filter desc 'Name of undefined field to store fields not in above lists if use_undefined is true' config_param :undefined_name, :string, default: 'undefined' + desc 'Normalize undefined fields to string - highly recommended' + config_param :undefined_to_string, :bool, default: false + # we can't directly add a field called @timestamp in a record_transform # filter because the '@' is special to fluentd desc 'Rename timestamp field to Elasticsearch compatible name' @@ -458,6 +462,13 @@ def filter(tag, time, record) check_for_match_and_format(tag, time, record) add_pipeline_metadata(tag, time, record) + if @undefined_to_string + record.each do |k,v| + unless @keep_fields.key?(k) || (v.is_a?(String)) + record[k] = JSON.dump(v) + end + end + end if @use_undefined # undefined contains all of the fields not in keep_fields undefined = record.reject{|k,v| @keep_fields.key?(k)} diff --git a/test/test_filter_viaq_data_model.rb 
b/test/test_filter_viaq_data_model.rb index e2b0e26..431a1ae 100644 --- a/test/test_filter_viaq_data_model.rb +++ b/test/test_filter_viaq_data_model.rb @@ -1463,4 +1463,22 @@ def add_event(input) assert_equal('crit', rec['level']) end end + + sub_test_case 'undefined_to_string' do + def emit_with_tag(tag, msg={}, conf='') + d = create_driver(conf) + d.run { + d.emit_with_tag(tag, msg, @time) + }.filtered.instance_variable_get(:@record_array)[0] + end + test 'see if undefined fields are normalized to string and kept at top level' do + rec = emit_with_tag('tag', {'a'=>'b','c'=>404,'d'=>{'e'=>'f'},'g'=>[1, 2, 3]}, ' + undefined_to_string true + ') + assert_equal('b', rec['a']) + assert_equal('404', rec['c']) + assert_equal('{"e":"f"}', rec['d']) + assert_equal('[1,2,3]', rec['g']) + end + end end