fix(apache#13734): Properly escape special characters in CSV output (a…

…pache#13735) * fix: Escape csv content during downloads * Reuse CsvResponse object * Use correct mimetype for csv responses * Ensure that headers are also escaped * Update escaping logic
betodealmeida · Mar 26, 2021 · ed88c9b · ed88c9b
1 parent d72c3b0
commit ed88c9b
Show file tree

Hide file tree

Showing 7 changed files with 160 additions and 21 deletions.
diff --git a/superset/charts/api.py b/superset/charts/api.py
@@ -484,12 +484,7 @@ def get_data_response(
         if result_format == ChartDataResultFormat.CSV:
             # return the first result
             data = result["queries"][0]["data"]
-            return CsvResponse(
-                data,
-                status=200,
-                headers=generate_download_headers("csv"),
-                mimetype="application/csv",
-            )
+            return CsvResponse(data, headers=generate_download_headers("csv"))
 
         if result_format == ChartDataResultFormat.JSON:
             response_data = simplejson.dumps(

diff --git a/superset/common/query_context.py b/superset/common/query_context.py
@@ -35,6 +35,7 @@
 )
 from superset.extensions import cache_manager, security_manager
 from superset.stats_logger import BaseStatsLogger
+from superset.utils import csv
 from superset.utils.cache import generate_cache_key, set_and_log_cache
 from superset.utils.core import (
     ChartDataResultFormat,
@@ -151,7 +152,9 @@ def df_metrics_to_num(df: pd.DataFrame, query_object: QueryObject) -> None:
     def get_data(self, df: pd.DataFrame,) -> Union[str, List[Dict[str, Any]]]:
         if self.result_format == ChartDataResultFormat.CSV:
             include_index = not isinstance(df.index, pd.RangeIndex)
-            result = df.to_csv(index=include_index, **config["CSV_EXPORT"])
+            result = csv.df_to_escaped_csv(
+                df, index=include_index, **config["CSV_EXPORT"]
+            )
             return result or ""
 
         return df.to_dict(orient="records")

diff --git a/superset/utils/csv.py b/superset/utils/csv.py
@@ -0,0 +1,67 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import re
+from typing import Any
+
+import pandas as pd
+
+negative_number_re = re.compile(r"^-[0-9.]+$")
+
+# This regex will match if the string starts with:
+#
+#     1. one of -, @, +, |, =, %
+#     2. two double quotes immediately followed by one of -, @, +, |, =, %
+#     3. one or more spaces immediately followed by one of -, @, +, |, =, %
+#
+problematic_chars_re = re.compile(r'^(?:"{2}|\s{1,})(?=[\-@+|=%])|^[\-@+|=%]')
+
+
+def escape_value(value: str) -> str:
+    """
+    Escapes a set of special characters.
+
+    http://georgemauer.net/2017/10/07/csv-injection.html
+    """
+    needs_escaping = problematic_chars_re.match(value) is not None
+    is_negative_number = negative_number_re.match(value) is not None
+
+    if needs_escaping and not is_negative_number:
+        # Escape pipe to be extra safe as this
+        # can lead to remote code execution
+        value = value.replace("|", "\\|")
+
+        # Precede the line with a single quote. This prevents
+        # evaluation of commands and some spreadsheet software
+        # will hide this visually from the user. Many articles
+        # claim a preceding space will work here too, however,
+        # when uploading a csv file in Google sheets, a leading
+        # space was ignored and code was still evaluated.
+        value = "'" + value
+
+    return value
+
+
+def df_to_escaped_csv(df: pd.DataFrame, **kwargs: Any) -> Any:
+    escape_values = lambda v: escape_value(v) if isinstance(v, str) else v
+
+    # Escape csv headers
+    df = df.rename(columns=escape_values)
+
+    # Escape csv rows
+    df = df.applymap(escape_values)
+
+    return df.to_csv(**kwargs)
diff --git a/superset/views/base.py b/superset/views/base.py
@@ -479,6 +479,7 @@ class CsvResponse(Response):  # pylint: disable=too-many-ancestors
     """
 
     charset = conf["CSV_EXPORT"].get("encoding", "utf-8")
+    default_mimetype = "text/csv"
 
 
 def check_ownership(obj: Any, raise_if_false: bool = True) -> bool:

diff --git a/superset/views/core.py b/superset/views/core.py
@@ -98,7 +98,7 @@
 from superset.sql_validators import get_validator_by_name
 from superset.tasks.async_queries import load_explore_json_into_cache
 from superset.typing import FlaskResponse
-from superset.utils import core as utils
+from superset.utils import core as utils, csv
 from superset.utils.async_query_manager import AsyncQueryTokenException
 from superset.utils.cache import etag_cache
 from superset.utils.core import ReservedUrlParameters
@@ -437,10 +437,7 @@ def generate_json(
     ) -> FlaskResponse:
         if response_type == utils.ChartDataResultFormat.CSV:
             return CsvResponse(
-                viz_obj.get_csv(),
-                status=200,
-                headers=generate_download_headers("csv"),
-                mimetype="application/csv",
+                viz_obj.get_csv(), headers=generate_download_headers("csv")
             )
 
         if response_type == utils.ChartDataResultType.QUERY:
@@ -2622,18 +2619,14 @@ def csv(self, client_id: str) -> FlaskResponse:  # pylint: disable=no-self-use
             columns = [c["name"] for c in obj["columns"]]
             df = pd.DataFrame.from_records(obj["data"], columns=columns)
             logger.info("Using pandas to convert to CSV")
-            csv = df.to_csv(index=False, **config["CSV_EXPORT"])
         else:
             logger.info("Running a query to turn into CSV")
             sql = query.select_sql or query.executed_sql
             df = query.database.get_df(sql, query.schema)
-            # TODO(bkyryliuk): add compression=gzip for big files.
-            csv = df.to_csv(index=False, **config["CSV_EXPORT"])
-        response = Response(csv, mimetype="text/csv")
+        csv_data = csv.df_to_escaped_csv(df, index=False, **config["CSV_EXPORT"])
         quoted_csv_name = parse.quote(query.name)
-        response.headers["Content-Disposition"] = (
-            f'attachment; filename="{quoted_csv_name}.csv"; '
-            f"filename*=UTF-8''{quoted_csv_name}.csv"
+        response = CsvResponse(
+            csv_data, headers=generate_download_headers("csv", quoted_csv_name)
         )
         event_info = {
             "event_type": "data_export",

diff --git a/superset/viz.py b/superset/viz.py
@@ -66,7 +66,7 @@
 from superset.models.cache import CacheKey
 from superset.models.helpers import QueryResult
 from superset.typing import QueryObjectDict, VizData, VizPayload
-from superset.utils import core as utils
+from superset.utils import core as utils, csv
 from superset.utils.cache import set_and_log_cache
 from superset.utils.core import (
     DTTM_ALIAS,
@@ -614,7 +614,7 @@ def data(self) -> Dict[str, Any]:
     def get_csv(self) -> Optional[str]:
         df = self.get_df_payload()["df"]  # leverage caching logic
         include_index = not isinstance(df.index, pd.RangeIndex)
-        return df.to_csv(index=include_index, **config["CSV_EXPORT"])
+        return csv.df_to_escaped_csv(df, index=include_index, **config["CSV_EXPORT"])
 
     def get_data(self, df: pd.DataFrame) -> VizData:
         return df.to_dict(orient="records")

diff --git a/tests/utils/csv_tests.py b/tests/utils/csv_tests.py
@@ -0,0 +1,80 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=no-self-use
+import io
+
+import pandas as pd
+import pytest
+
+from superset.utils import csv
+
+
+def test_escape_value():
+    result = csv.escape_value("value")
+    assert result == "value"
+
+    result = csv.escape_value("-10")
+    assert result == "-10"
+
+    result = csv.escape_value("@value")
+    assert result == "'@value"
+
+    result = csv.escape_value("+value")
+    assert result == "'+value"
+
+    result = csv.escape_value("-value")
+    assert result == "'-value"
+
+    result = csv.escape_value("=value")
+    assert result == "'=value"
+
+    result = csv.escape_value("|value")
+    assert result == "'\|value"
+
+    result = csv.escape_value("%value")
+    assert result == "'%value"
+
+    result = csv.escape_value("=cmd|' /C calc'!A0")
+    assert result == "'=cmd\|' /C calc'!A0"
+
+    result = csv.escape_value('""=10+2')
+    assert result == '\'""=10+2'
+
+    result = csv.escape_value(" =10+2")
+    assert result == "' =10+2"
+
+
+def test_df_to_escaped_csv():
+    csv_rows = [
+        ["col_a", "=func()"],
+        ["-10", "=cmd|' /C calc'!A0"],
+        ["a", '""=b'],
+        [" =a", "b"],
+    ]
+    csv_str = "\n".join([",".join(row) for row in csv_rows])
+
+    df = pd.read_csv(io.StringIO(csv_str))
+
+    escaped_csv_str = csv.df_to_escaped_csv(df, encoding="utf8", index=False)
+    escaped_csv_rows = [row.split(",") for row in escaped_csv_str.strip().split("\n")]
+
+    assert escaped_csv_rows == [
+        ["col_a", "'=func()"],
+        ["-10", "'=cmd\|' /C calc'!A0"],
+        ["a", "'=b"],  # pandas seems to be removing the leading ""
+        ["' =a", "b"],
+    ]