From ed88c9bd4748bf3b8ff399eb0ac03df8719a5d6a Mon Sep 17 00:00:00 2001 From: Ben Reinhart Date: Fri, 26 Mar 2021 15:22:00 -0700 Subject: [PATCH] fix(#13734): Properly escape special characters in CSV output (#13735) * fix: Escape csv content during downloads * Reuse CsvResponse object * Use correct mimetype for csv responses * Ensure that headers are also escaped * Update escaping logic --- superset/charts/api.py | 7 +-- superset/common/query_context.py | 5 +- superset/utils/csv.py | 67 ++++++++++++++++++++++++++ superset/views/base.py | 1 + superset/views/core.py | 17 ++----- superset/viz.py | 4 +- tests/utils/csv_tests.py | 80 ++++++++++++++++++++++++++++++++ 7 files changed, 160 insertions(+), 21 deletions(-) create mode 100644 superset/utils/csv.py create mode 100644 tests/utils/csv_tests.py diff --git a/superset/charts/api.py b/superset/charts/api.py index c84a84351e840..83c0eb035a5eb 100644 --- a/superset/charts/api.py +++ b/superset/charts/api.py @@ -484,12 +484,7 @@ def get_data_response( if result_format == ChartDataResultFormat.CSV: # return the first result data = result["queries"][0]["data"] - return CsvResponse( - data, - status=200, - headers=generate_download_headers("csv"), - mimetype="application/csv", - ) + return CsvResponse(data, headers=generate_download_headers("csv")) if result_format == ChartDataResultFormat.JSON: response_data = simplejson.dumps( diff --git a/superset/common/query_context.py b/superset/common/query_context.py index a740af5490bf4..e8b9fe85b46c2 100644 --- a/superset/common/query_context.py +++ b/superset/common/query_context.py @@ -35,6 +35,7 @@ ) from superset.extensions import cache_manager, security_manager from superset.stats_logger import BaseStatsLogger +from superset.utils import csv from superset.utils.cache import generate_cache_key, set_and_log_cache from superset.utils.core import ( ChartDataResultFormat, @@ -151,7 +152,9 @@ def df_metrics_to_num(df: pd.DataFrame, query_object: QueryObject) -> None: def get_data(self, df: pd.DataFrame,) -> Union[str, List[Dict[str, Any]]]: if self.result_format == ChartDataResultFormat.CSV: include_index = not isinstance(df.index, pd.RangeIndex) - result = df.to_csv(index=include_index, **config["CSV_EXPORT"]) + result = csv.df_to_escaped_csv( + df, index=include_index, **config["CSV_EXPORT"] + ) return result or "" return df.to_dict(orient="records") diff --git a/superset/utils/csv.py b/superset/utils/csv.py new file mode 100644 index 0000000000000..d91d427d5ac71 --- /dev/null +++ b/superset/utils/csv.py @@ -0,0 +1,67 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import re +from typing import Any + +import pandas as pd + +negative_number_re = re.compile(r"^-[0-9.]+$") + +# This regex will match if the string starts with: +# +# 1. one of -, @, +, |, =, % +# 2. two double quotes immediately followed by one of -, @, +, |, =, % +# 3. one or more spaces immediately followed by one of -, @, +, |, =, % +# +problematic_chars_re = re.compile(r'^(?:"{2}|\s{1,})(?=[\-@+|=%])|^[\-@+|=%]') + + +def escape_value(value: str) -> str: + """ + Escapes a set of special characters. + + http://georgemauer.net/2017/10/07/csv-injection.html + """ + needs_escaping = problematic_chars_re.match(value) is not None + is_negative_number = negative_number_re.match(value) is not None + + if needs_escaping and not is_negative_number: + # Escape pipe to be extra safe as this + # can lead to remote code execution + value = value.replace("|", "\\|") + + # Precede the line with a single quote. This prevents + # evaluation of commands and some spreadsheet software + # will hide this visually from the user. Many articles + # claim a preceding space will work here too, however, + # when uploading a csv file in Google sheets, a leading + # space was ignored and code was still evaluated. + value = "'" + value + + return value + + +def df_to_escaped_csv(df: pd.DataFrame, **kwargs: Any) -> Any: + escape_values = lambda v: escape_value(v) if isinstance(v, str) else v + + # Escape csv headers + df = df.rename(columns=escape_values) + + # Escape csv rows + df = df.applymap(escape_values) + + return df.to_csv(**kwargs) diff --git a/superset/views/base.py b/superset/views/base.py index cb486c51357ab..ea80bb6098f7e 100644 --- a/superset/views/base.py +++ b/superset/views/base.py @@ -479,6 +479,7 @@ class CsvResponse(Response): # pylint: disable=too-many-ancestors """ charset = conf["CSV_EXPORT"].get("encoding", "utf-8") + default_mimetype = "text/csv" def check_ownership(obj: Any, raise_if_false: bool = True) -> bool: diff --git a/superset/views/core.py b/superset/views/core.py index 2c390c4062dd2..42d50a9133f2e 100755 --- a/superset/views/core.py +++ b/superset/views/core.py @@ -98,7 +98,7 @@ from superset.sql_validators import get_validator_by_name from superset.tasks.async_queries import load_explore_json_into_cache from superset.typing import FlaskResponse -from superset.utils import core as utils +from superset.utils import core as utils, csv from superset.utils.async_query_manager import AsyncQueryTokenException from superset.utils.cache import etag_cache from superset.utils.core import ReservedUrlParameters @@ -437,10 +437,7 @@ def generate_json( ) -> FlaskResponse: if response_type == utils.ChartDataResultFormat.CSV: return CsvResponse( - viz_obj.get_csv(), - status=200, - headers=generate_download_headers("csv"), - mimetype="application/csv", + viz_obj.get_csv(), headers=generate_download_headers("csv") ) if response_type == utils.ChartDataResultType.QUERY: @@ -2622,18 +2619,14 @@ def csv(self, client_id: str) -> FlaskResponse: # pylint: disable=no-self-use columns = [c["name"] for c in obj["columns"]] df = pd.DataFrame.from_records(obj["data"], columns=columns) logger.info("Using pandas to convert to CSV") - csv = df.to_csv(index=False, **config["CSV_EXPORT"]) else: logger.info("Running a query to turn into CSV") sql = query.select_sql or query.executed_sql df = query.database.get_df(sql, query.schema) - # TODO(bkyryliuk): add compression=gzip for big files. - csv = df.to_csv(index=False, **config["CSV_EXPORT"]) - response = Response(csv, mimetype="text/csv") + csv_data = csv.df_to_escaped_csv(df, index=False, **config["CSV_EXPORT"]) quoted_csv_name = parse.quote(query.name) - response.headers["Content-Disposition"] = ( - f'attachment; filename="{quoted_csv_name}.csv"; ' - f"filename*=UTF-8''{quoted_csv_name}.csv" + response = CsvResponse( + csv_data, headers=generate_download_headers("csv", quoted_csv_name) ) event_info = { "event_type": "data_export", diff --git a/superset/viz.py b/superset/viz.py index b00a945ccc31a..d00ae5be19a41 100644 --- a/superset/viz.py +++ b/superset/viz.py @@ -66,7 +66,7 @@ from superset.models.cache import CacheKey from superset.models.helpers import QueryResult from superset.typing import QueryObjectDict, VizData, VizPayload -from superset.utils import core as utils +from superset.utils import core as utils, csv from superset.utils.cache import set_and_log_cache from superset.utils.core import ( DTTM_ALIAS, @@ -614,7 +614,7 @@ def data(self) -> Dict[str, Any]: def get_csv(self) -> Optional[str]: df = self.get_df_payload()["df"] # leverage caching logic include_index = not isinstance(df.index, pd.RangeIndex) - return df.to_csv(index=include_index, **config["CSV_EXPORT"]) + return csv.df_to_escaped_csv(df, index=include_index, **config["CSV_EXPORT"]) def get_data(self, df: pd.DataFrame) -> VizData: return df.to_dict(orient="records") diff --git a/tests/utils/csv_tests.py b/tests/utils/csv_tests.py new file mode 100644 index 0000000000000..e992fcb5931ca --- /dev/null +++ b/tests/utils/csv_tests.py @@ -0,0 +1,80 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=no-self-use +import io + +import pandas as pd +import pytest + +from superset.utils import csv + + +def test_escape_value(): + result = csv.escape_value("value") + assert result == "value" + + result = csv.escape_value("-10") + assert result == "-10" + + result = csv.escape_value("@value") + assert result == "'@value" + + result = csv.escape_value("+value") + assert result == "'+value" + + result = csv.escape_value("-value") + assert result == "'-value" + + result = csv.escape_value("=value") + assert result == "'=value" + + result = csv.escape_value("|value") + assert result == "'\|value" + + result = csv.escape_value("%value") + assert result == "'%value" + + result = csv.escape_value("=cmd|' /C calc'!A0") + assert result == "'=cmd\|' /C calc'!A0" + + result = csv.escape_value('""=10+2') + assert result == '\'""=10+2' + + result = csv.escape_value(" =10+2") + assert result == "' =10+2" + + +def test_df_to_escaped_csv(): + csv_rows = [ + ["col_a", "=func()"], + ["-10", "=cmd|' /C calc'!A0"], + ["a", '""=b'], + [" =a", "b"], + ] + csv_str = "\n".join([",".join(row) for row in csv_rows]) + + df = pd.read_csv(io.StringIO(csv_str)) + + escaped_csv_str = csv.df_to_escaped_csv(df, encoding="utf8", index=False) + escaped_csv_rows = [row.split(",") for row in escaped_csv_str.strip().split("\n")] + + assert escaped_csv_rows == [ + ["col_a", "'=func()"], + ["-10", "'=cmd\|' /C calc'!A0"], + ["a", "'=b"], # pandas seems to be removing the leading "" + ["' =a", "b"], + ]