Skip to content

Commit

Permalink
fix(apache#13734): Properly escape special characters in CSV output (a…
Browse files Browse the repository at this point in the history
…pache#13735)

* fix: Escape csv content during downloads

* Reuse CsvResponse object

* Use correct mimetype for csv responses

* Ensure that headers are also escaped

* Update escaping logic
  • Loading branch information
benjreinhart authored and betodealmeida committed Mar 26, 2021
1 parent d72c3b0 commit ed88c9b
Show file tree
Hide file tree
Showing 7 changed files with 160 additions and 21 deletions.
7 changes: 1 addition & 6 deletions superset/charts/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -484,12 +484,7 @@ def get_data_response(
if result_format == ChartDataResultFormat.CSV:
# return the first result
data = result["queries"][0]["data"]
return CsvResponse(
data,
status=200,
headers=generate_download_headers("csv"),
mimetype="application/csv",
)
return CsvResponse(data, headers=generate_download_headers("csv"))

if result_format == ChartDataResultFormat.JSON:
response_data = simplejson.dumps(
Expand Down
5 changes: 4 additions & 1 deletion superset/common/query_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
)
from superset.extensions import cache_manager, security_manager
from superset.stats_logger import BaseStatsLogger
from superset.utils import csv
from superset.utils.cache import generate_cache_key, set_and_log_cache
from superset.utils.core import (
ChartDataResultFormat,
Expand Down Expand Up @@ -151,7 +152,9 @@ def df_metrics_to_num(df: pd.DataFrame, query_object: QueryObject) -> None:
def get_data(self, df: pd.DataFrame,) -> Union[str, List[Dict[str, Any]]]:
if self.result_format == ChartDataResultFormat.CSV:
include_index = not isinstance(df.index, pd.RangeIndex)
result = df.to_csv(index=include_index, **config["CSV_EXPORT"])
result = csv.df_to_escaped_csv(
df, index=include_index, **config["CSV_EXPORT"]
)
return result or ""

return df.to_dict(orient="records")
Expand Down
67 changes: 67 additions & 0 deletions superset/utils/csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import re
from typing import Any

import pandas as pd

negative_number_re = re.compile(r"^-[0-9.]+$")

# This regex will match if the string starts with:
#
# 1. one of -, @, +, |, =, %
# 2. two double quotes immediately followed by one of -, @, +, |, =, %
# 3. one or more spaces immediately followed by one of -, @, +, |, =, %
#
problematic_chars_re = re.compile(r'^(?:"{2}|\s{1,})(?=[\-@+|=%])|^[\-@+|=%]')


def escape_value(value: str) -> str:
"""
Escapes a set of special characters.
http://georgemauer.net/2017/10/07/csv-injection.html
"""
needs_escaping = problematic_chars_re.match(value) is not None
is_negative_number = negative_number_re.match(value) is not None

if needs_escaping and not is_negative_number:
# Escape pipe to be extra safe as this
# can lead to remote code execution
value = value.replace("|", "\\|")

# Precede the line with a single quote. This prevents
# evaluation of commands and some spreadsheet software
# will hide this visually from the user. Many articles
# claim a preceding space will work here too, however,
# when uploading a csv file in Google sheets, a leading
# space was ignored and code was still evaluated.
value = "'" + value

return value


def df_to_escaped_csv(df: pd.DataFrame, **kwargs: Any) -> Any:
escape_values = lambda v: escape_value(v) if isinstance(v, str) else v

# Escape csv headers
df = df.rename(columns=escape_values)

# Escape csv rows
df = df.applymap(escape_values)

return df.to_csv(**kwargs)
1 change: 1 addition & 0 deletions superset/views/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -479,6 +479,7 @@ class CsvResponse(Response): # pylint: disable=too-many-ancestors
"""

charset = conf["CSV_EXPORT"].get("encoding", "utf-8")
default_mimetype = "text/csv"


def check_ownership(obj: Any, raise_if_false: bool = True) -> bool:
Expand Down
17 changes: 5 additions & 12 deletions superset/views/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@
from superset.sql_validators import get_validator_by_name
from superset.tasks.async_queries import load_explore_json_into_cache
from superset.typing import FlaskResponse
from superset.utils import core as utils
from superset.utils import core as utils, csv
from superset.utils.async_query_manager import AsyncQueryTokenException
from superset.utils.cache import etag_cache
from superset.utils.core import ReservedUrlParameters
Expand Down Expand Up @@ -437,10 +437,7 @@ def generate_json(
) -> FlaskResponse:
if response_type == utils.ChartDataResultFormat.CSV:
return CsvResponse(
viz_obj.get_csv(),
status=200,
headers=generate_download_headers("csv"),
mimetype="application/csv",
viz_obj.get_csv(), headers=generate_download_headers("csv")
)

if response_type == utils.ChartDataResultType.QUERY:
Expand Down Expand Up @@ -2622,18 +2619,14 @@ def csv(self, client_id: str) -> FlaskResponse: # pylint: disable=no-self-use
columns = [c["name"] for c in obj["columns"]]
df = pd.DataFrame.from_records(obj["data"], columns=columns)
logger.info("Using pandas to convert to CSV")
csv = df.to_csv(index=False, **config["CSV_EXPORT"])
else:
logger.info("Running a query to turn into CSV")
sql = query.select_sql or query.executed_sql
df = query.database.get_df(sql, query.schema)
# TODO(bkyryliuk): add compression=gzip for big files.
csv = df.to_csv(index=False, **config["CSV_EXPORT"])
response = Response(csv, mimetype="text/csv")
csv_data = csv.df_to_escaped_csv(df, index=False, **config["CSV_EXPORT"])
quoted_csv_name = parse.quote(query.name)
response.headers["Content-Disposition"] = (
f'attachment; filename="{quoted_csv_name}.csv"; '
f"filename*=UTF-8''{quoted_csv_name}.csv"
response = CsvResponse(
csv_data, headers=generate_download_headers("csv", quoted_csv_name)
)
event_info = {
"event_type": "data_export",
Expand Down
4 changes: 2 additions & 2 deletions superset/viz.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@
from superset.models.cache import CacheKey
from superset.models.helpers import QueryResult
from superset.typing import QueryObjectDict, VizData, VizPayload
from superset.utils import core as utils
from superset.utils import core as utils, csv
from superset.utils.cache import set_and_log_cache
from superset.utils.core import (
DTTM_ALIAS,
Expand Down Expand Up @@ -614,7 +614,7 @@ def data(self) -> Dict[str, Any]:
def get_csv(self) -> Optional[str]:
df = self.get_df_payload()["df"] # leverage caching logic
include_index = not isinstance(df.index, pd.RangeIndex)
return df.to_csv(index=include_index, **config["CSV_EXPORT"])
return csv.df_to_escaped_csv(df, index=include_index, **config["CSV_EXPORT"])

def get_data(self, df: pd.DataFrame) -> VizData:
return df.to_dict(orient="records")
Expand Down
80 changes: 80 additions & 0 deletions tests/utils/csv_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=no-self-use
import io

import pandas as pd
import pytest

from superset.utils import csv


def test_escape_value():
result = csv.escape_value("value")
assert result == "value"

result = csv.escape_value("-10")
assert result == "-10"

result = csv.escape_value("@value")
assert result == "'@value"

result = csv.escape_value("+value")
assert result == "'+value"

result = csv.escape_value("-value")
assert result == "'-value"

result = csv.escape_value("=value")
assert result == "'=value"

result = csv.escape_value("|value")
assert result == "'\|value"

result = csv.escape_value("%value")
assert result == "'%value"

result = csv.escape_value("=cmd|' /C calc'!A0")
assert result == "'=cmd\|' /C calc'!A0"

result = csv.escape_value('""=10+2')
assert result == '\'""=10+2'

result = csv.escape_value(" =10+2")
assert result == "' =10+2"


def test_df_to_escaped_csv():
csv_rows = [
["col_a", "=func()"],
["-10", "=cmd|' /C calc'!A0"],
["a", '""=b'],
[" =a", "b"],
]
csv_str = "\n".join([",".join(row) for row in csv_rows])

df = pd.read_csv(io.StringIO(csv_str))

escaped_csv_str = csv.df_to_escaped_csv(df, encoding="utf8", index=False)
escaped_csv_rows = [row.split(",") for row in escaped_csv_str.strip().split("\n")]

assert escaped_csv_rows == [
["col_a", "'=func()"],
["-10", "'=cmd\|' /C calc'!A0"],
["a", "'=b"], # pandas seems to be removing the leading ""
["' =a", "b"],
]

0 comments on commit ed88c9b

Please sign in to comment.