Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: change the validation logic for python_date_format #25510

Merged
merged 21 commits into from
Jan 23, 2024
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions UPDATING.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ assists people when migrating to a new version.
- [26636](https://github.com/apache/superset/issues/26636): Sets the `DASHBOARD_VIRTUALIZATION` feature flag to `True` by default. This feature was introduced by [21438](https://github.com/apache/superset/pull/21438) and will enable virtualization when rendering a dashboard's charts in an attempt to reduce the number of elements (DOM nodes) rendered at once. This is especially useful for large dashboards.
- [26637](https://github.com/apache/superset/issues/26637): Sets the `DRILL_BY` feature flag to `True` by default given that the feature has been tested for a while and reached a stable state.
- [26462](https://github.com/apache/superset/issues/26462): Removes the Profile feature given that it's not actively maintained and not widely used.
- [25510](https://github.com/apache/superset/pull/25510): Reinstates the restriction that only ISO 8601 formats are acceptable for a column's `python_date_format` (enforced by way of validation at the API and database level). Dataset owners will need to use a SQL expression instead to convert their string columns of the form `%Y/%m/%d` etc. to a DATE, DATETIME, etc. type.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As I understand from @john-bodley's comment, this won't be a breaking change. If that's the case, this should be moved to the Next section and the text should be changed given that:

any existing non-ISO 8601 values will still be “valid”.

mapledan marked this conversation as resolved.
Show resolved Hide resolved

### Potential Downtime

Expand Down
12 changes: 12 additions & 0 deletions superset/commands/dataset/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,18 @@ def __init__(self) -> None:
super().__init__([_("One or more columns already exist")], field_name="columns")


class DatasetColumnsConstraintsValidationError(ValidationError):
    """
    Marshmallow validation error raised when one or more dataset columns
    carry an invalid python_date_format
    """

    def __init__(self) -> None:
        # The error is attached to the "columns" field of the payload.
        messages = [_("One or more columns have an invalid python_date_format")]
        super().__init__(messages, field_name="columns")


class DatasetMetricsNotFoundValidationError(ValidationError):
"""
Marshmallow validation error when dataset metric for update does not exist
Expand Down
8 changes: 8 additions & 0 deletions superset/commands/dataset/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from superset.commands.dataset.exceptions import (
DatabaseChangeValidationError,
DatasetColumnNotFoundValidationError,
DatasetColumnsConstraintsValidationError,
DatasetColumnsDuplicateValidationError,
DatasetColumnsExistsValidationError,
DatasetExistsValidationError,
Expand Down Expand Up @@ -139,6 +140,13 @@ def _validate_columns(
):
exceptions.append(DatasetColumnsExistsValidationError())

# validate python_date_format is ISO8601 format
mapledan marked this conversation as resolved.
Show resolved Hide resolved
for col in columns:
if not DatasetDAO.validate_column_pdf_is_iso8601(
col.get("python_date_format", None)
mapledan marked this conversation as resolved.
Show resolved Hide resolved
):
exceptions.append(DatasetColumnsConstraintsValidationError())

def _validate_metrics(
self, metrics: list[dict[str, Any]], exceptions: list[ValidationError]
) -> None:
Expand Down
12 changes: 12 additions & 0 deletions superset/connectors/sqla/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
relationship,
RelationshipProperty,
Session,
validates,
)
from sqlalchemy.orm.mapper import Mapper
from sqlalchemy.schema import UniqueConstraint
Expand Down Expand Up @@ -830,6 +831,17 @@ def init_on_load(self) -> None:
def __repr__(self) -> str:
return str(self.column_name)

@validates("python_date_format")
def validate_pdf_is_iso8601(self, _: str, dt_format: str) -> str:
mapledan marked this conversation as resolved.
Show resolved Hide resolved
if dt_format in ("epoch_s", "epoch_ms", None):
mapledan marked this conversation as resolved.
Show resolved Hide resolved
return dt_format
mapledan marked this conversation as resolved.
Show resolved Hide resolved
try:
dt_str = datetime.now().strftime(dt_format)
dateutil.parser.isoparse(dt_str)
except ValueError as ex:
raise ValueError("python_date_format is invalid ISO 8601 format") from ex
return dt_format

@property
def is_boolean(self) -> bool:
"""
Expand Down
13 changes: 13 additions & 0 deletions superset/daos/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@
from __future__ import annotations

import logging
from datetime import datetime
from typing import Any

import dateutil.parser
from sqlalchemy.exc import SQLAlchemyError

from superset.connectors.sqla.models import SqlaTable, SqlMetric, TableColumn
Expand Down Expand Up @@ -150,6 +152,17 @@ def validate_metrics_uniqueness(dataset_id: int, metrics_names: list[str]) -> bo
).all()
return len(dataset_query) == 0

@staticmethod
def validate_column_pdf_is_iso8601(dt_format: str) -> bool:
if dt_format in ("epoch_s", "epoch_ms", None):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See previous comment.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any reason this logic is somewhat duplicated in three places? Can this be refactored into a helper method if neither the model nor the DAO seems appropriate for both cases?

Copy link
Contributor Author

@mapledan mapledan Jan 23, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The TableColumn insert/update using bulk_insert_mappings or bulk_update_mappings does not trigger the ORM's @validates.
I refactored the duplicated code to perform validation exclusively at the DAO level.

return True
try:
dt_str = datetime.now().strftime(dt_format)
dateutil.parser.isoparse(dt_str)
return True
except ValueError:
return False

@classmethod
def update(
cls,
Expand Down
30 changes: 15 additions & 15 deletions superset/datasets/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,10 @@
# specific language governing permissions and limitations
# under the License.
import json
import re
from datetime import datetime
from typing import Any

from dateutil.parser import isoparse
from flask_babel import lazy_gettext as _
from marshmallow import fields, pre_load, Schema, ValidationError
from marshmallow.validate import Length
Expand All @@ -43,26 +44,25 @@
}


def validate_python_date_format(value: str) -> None:
regex = re.compile(
r"""
^(
epoch_s|epoch_ms|
(?P<date>%Y([-/]%m([-/]%d)?)?)([\sT](?P<time>%H(:%M(:%S(\.%f)?)?)?))?
)$
""",
re.VERBOSE,
)
match = regex.match(value or "")
if not match:
raise ValidationError([_("Invalid date/timestamp format")])
def validate_python_date_format(dt_format: str) -> bool:
if dt_format in ("epoch_s", "epoch_ms"):
return True
try:
dt_str = datetime.now().strftime(dt_format)
isoparse(dt_str)
mapledan marked this conversation as resolved.
Show resolved Hide resolved
except ValueError as ex:
raise ValidationError([_("Invalid date/timestamp format")]) from ex
return True


class DatasetColumnsPutSchema(Schema):
id = fields.Integer(required=False)
column_name = fields.String(required=True, validate=Length(1, 255))
type = fields.String(allow_none=True)
advanced_data_type = fields.String(allow_none=True, validate=Length(1, 255))
advanced_data_type = fields.String(
allow_none=True,
validate=Length(1, 255),
)
verbose_name = fields.String(allow_none=True, metadata={Length: (1, 1024)})
description = fields.String(allow_none=True)
expression = fields.String(allow_none=True)
Expand Down
8 changes: 0 additions & 8 deletions tests/unit_tests/config_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,6 @@ def test_python_date_format_by_column_name(
"dttm_columns": {
"id": {"python_date_format": "epoch_ms"},
"dttm": {"python_date_format": "epoch_s"},
"duration_ms": {"python_date_format": "invalid"},
},
}
mocker.patch(
Expand All @@ -228,7 +227,6 @@ def test_python_date_format_by_column_name(
return_value=[
{"column_name": "id", "type": "INTEGER", "is_dttm": False},
{"column_name": "dttm", "type": "INTEGER", "is_dttm": False},
{"column_name": "duration_ms", "type": "INTEGER", "is_dttm": False},
],
)

Expand All @@ -242,12 +240,6 @@ def test_python_date_format_by_column_name(
assert dttm_col.is_dttm
assert dttm_col.python_date_format == "epoch_s"

duration_ms_col = [c for c in test_table.columns if c.column_name == "duration_ms"][
0
]
assert duration_ms_col.is_dttm
assert duration_ms_col.python_date_format == "invalid"


def test_expression_by_column_name(
mocker: MockerFixture,
Expand Down
48 changes: 48 additions & 0 deletions tests/unit_tests/datasets/schema_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# pylint: disable=import-outside-toplevel, invalid-name, unused-argument, redefined-outer-name
import pytest
from marshmallow import ValidationError

from superset.datasets.schemas import validate_python_date_format


# pylint: disable=too-few-public-methods
@pytest.mark.parametrize(
    "payload",
    ["epoch_ms", "epoch_s", "%Y-%m-%dT%H:%M:%S.%f", "%Y%m%d"],
)
def test_validate_python_date_format(payload) -> None:
    """The validator returns a truthy result for each of these payloads."""
    outcome = validate_python_date_format(payload)
    assert outcome


@pytest.mark.parametrize(
    "payload",
    ["%d%m%Y", "%Y/%m/%dT%H:%M:%S.%f"],
)
def test_validate_python_date_format_raises(payload) -> None:
    """The validator raises ValidationError for each of these payloads."""
    with pytest.raises(ValidationError):
        validate_python_date_format(payload)