Skip to content

Preprocessing

preprocessing

series_semantic

constant_series(series, constant)

Create a series with a constant value for each row of series.

Source code in preprocessing/series_semantic.py
220
221
222
def constant_series(series: pl.Series, constant) -> pl.Series:
    """Create a series with a constant value for each row of `series`.

    Args:
        series: Series whose length determines the length of the result.
        constant: Value repeated in every row of the result.

    Returns:
        A new series with `series.len()` rows, each holding `constant`.
        Boolean and null constants keep the `pl.Boolean` dtype; for any other
        constant the dtype is inferred from the value rather than being forced
        to Boolean.
    """
    values = [constant] * series.len()
    if constant is None or isinstance(constant, bool):
        # Pin the dtype so an empty or all-null result is still Boolean.
        return pl.Series(values, dtype=pl.Boolean)
    return pl.Series(values)

parse_datetime_with_tz(s)

Parse datetime strings with timezone info (both abbreviations and offsets)

Source code in preprocessing/series_semantic.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
def parse_datetime_with_tz(s: pl.Series) -> pl.Series:
    """Parse datetime strings with timezone info (both abbreviations and offsets).

    A trailing timezone abbreviation (e.g. " UTC", " EST") or a trailing
    numeric offset (e.g. "-05:00") is stripped from each string before
    parsing, so the timezone itself is discarded rather than applied.

    Args:
        s: String series of datetime values.

    Returns:
        The series parsed with `str.strptime(pl.Datetime(), strict=False)`
        after the timezone suffixes have been removed.

    Warns:
        UserWarning: If more than one distinct timezone suffix (abbreviations
            and offsets combined) is present in `s`.
    """
    import warnings

    # Handle timezone abbreviations like "UTC", "EST"
    tz_abbrev_regex = r" ([A-Z]{3,4})$"  # UTC, EST, etc.

    # Handle timezone offsets like "-05:00", "+00:00"
    tz_offset_regex = r"[+-]\d{2}:\d{2}$"  # -05:00, +00:00, etc.

    def _unique_suffixes(pattern):
        """Collect the distinct trailing matches of `pattern` across `s`."""
        found = set()
        for match_list in s.str.extract_all(pattern).to_list():
            if match_list:  # Skips null rows and rows without a match
                # The abbreviation pattern matches the separating space too;
                # strip it so the reported names are clean.
                found.update(match.strip() for match in match_list)
        return found

    unique_abbrevs = _unique_suffixes(tz_abbrev_regex)
    unique_offsets = _unique_suffixes(tz_offset_regex)

    # Warn if multiple different timezones found
    if len(unique_abbrevs) + len(unique_offsets) > 1:
        # Sort so the message is deterministic (set order varies between runs).
        all_tz = sorted(unique_abbrevs) + sorted(unique_offsets)
        warnings.warn(
            f"Multiple timezones found in datetime column: {all_tz}. "
            f"Assuming all timestamps represent the same timezone for analysis purposes.",
            UserWarning,
            stacklevel=2,  # Attribute the warning to the caller, not this helper
        )

    # Try to remove timezone abbreviations first
    result = s.str.replace(tz_abbrev_regex, "")

    # Then remove timezone offsets
    result = result.str.replace(tz_offset_regex, "")

    return result.str.strptime(pl.Datetime(), strict=False)

test_series_semantic

test_all_none_series()

Test series with all null values

Source code in preprocessing/test_series_semantic.py
190
191
192
193
194
def test_all_none_series():
    """A string series containing only nulls is inferred as free text."""
    all_nulls = pl.Series([None, None, None], dtype=pl.String)
    inferred = infer_series_semantic(all_nulls)
    assert inferred.semantic_name == "free_text"

test_date_string_parsing()

Test date-only string parsing

Source code in preprocessing/test_series_semantic.py
43
44
45
46
47
48
49
50
def test_date_string_parsing():
    """Date-only strings pass the check and convert to non-null `pl.Date`."""
    dates = pl.Series(["2025-02-28", "2025-02-27"])
    assert date_string.check(dates)

    converted = date_string.try_convert(dates)
    assert converted.dtype == pl.Date
    assert converted.is_not_null().all()

test_datetime_timezone_inference()

Test main inference function with timezone datetime

Source code in preprocessing/test_series_semantic.py
78
79
80
81
82
83
84
85
def test_datetime_timezone_inference():
    """The main inference entry point classifies "... UTC" strings as datetime."""
    utc_strings = pl.Series(["2025-02-28 00:36:15 UTC"] * 10)
    inferred = infer_series_semantic(utc_strings)

    assert inferred is not None
    assert inferred.semantic_name == "datetime"
    assert inferred.data_type == "datetime"

test_datetime_with_timezone_offset_parsing()

Test timezone offset datetime strings get recognized as datetime semantic

Source code in preprocessing/test_series_semantic.py
145
146
147
148
149
150
151
152
153
154
155
def test_datetime_with_timezone_offset_parsing():
    """Strings ending in a numeric UTC offset are recognized as datetime strings."""
    offset_strings = pl.Series(
        ["2025-01-27 00:07:12.056000-05:00", "2025-01-27 00:07:16.126000-05:00"]
    )
    assert datetime_string.check(offset_strings)

    # Conversion must yield a fully populated Datetime series
    converted = datetime_string.try_convert(offset_strings)
    assert converted.dtype == pl.Datetime
    assert converted.is_not_null().all()

test_datetime_with_timezone_parsing()

Test parsing datetime strings with timezone

Source code in preprocessing/test_series_semantic.py
32
33
34
35
36
37
38
39
40
def test_datetime_with_timezone_parsing():
    """Strings ending in a timezone abbreviation are recognized and converted."""
    utc_strings = pl.Series(["2025-02-28 00:36:15 UTC", "2025-02-28 00:36:13 UTC"])
    assert datetime_string.check(utc_strings)

    # Conversion must yield a fully populated Datetime series
    converted = datetime_string.try_convert(utc_strings)
    assert converted.dtype == pl.Datetime
    assert converted.is_not_null().all()

test_mixed_valid_invalid_dates()

Test series with mix of valid and invalid datetime strings

Source code in preprocessing/test_series_semantic.py
197
198
199
200
201
202
203
204
205
def test_mixed_valid_invalid_dates():
    """A partly invalid datetime column is free text unless the threshold is lowered."""
    partly_valid = pl.Series(["2025-01-01 UTC", "invalid_date", "2025-01-02 UTC"])

    # With no explicit threshold (0.8 per the original author's note) the
    # series should fall back to free_text.
    default_result = infer_series_semantic(partly_valid)
    assert default_result.semantic_name == "free_text"

    # A permissive threshold lets the datetime semantic win.
    lenient_result = infer_series_semantic(series=partly_valid, threshold=0.2)
    assert lenient_result.semantic_name == "datetime"

test_native_date_recognition()

Test that pl.Date columns get recognized correctly

Source code in preprocessing/test_series_semantic.py
18
19
20
21
22
23
def test_native_date_recognition():
    """A series with the native `pl.Date` dtype passes the `native_date` check."""
    first_day = datetime(2025, 1, 1).date()
    second_day = datetime(2025, 1, 2).date()
    dates = pl.Series([first_day, second_day], dtype=pl.Date)
    assert native_date.check(dates)

test_native_datetime_recognition()

Test that pl.Datetime columns get recognized correctly

Source code in preprocessing/test_series_semantic.py
26
27
28
29
def test_native_datetime_recognition():
    """A series built from `datetime` objects passes the `native_datetime` check."""
    timestamps = [datetime(2025, 1, 1, 12, 0), datetime(2025, 1, 2, 13, 0)]
    assert native_datetime.check(pl.Series(timestamps))

test_native_types_inference()

Test that native temporal types get recognized

Source code in preprocessing/test_series_semantic.py
88
89
90
91
92
93
94
def test_native_types_inference():
    """Inference reports the native semantics for Date and Datetime series."""
    dates = pl.Series([datetime(2025, 1, 1).date()] * 10, dtype=pl.Date)
    timestamps = pl.Series([datetime(2025, 1, 1, 12, 0)] * 10)

    date_semantic = infer_series_semantic(dates)
    assert date_semantic.semantic_name == "native_date"

    datetime_semantic = infer_series_semantic(timestamps)
    assert datetime_semantic.semantic_name == "native_datetime"

test_parse_datetime_mixed_timezones_warning()

Test that mixed timezones trigger a warning

Source code in preprocessing/test_series_semantic.py
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
def test_parse_datetime_mixed_timezones_warning():
    """Mixing timezone suffixes emits one UserWarning but parsing still succeeds."""
    import warnings

    # Two abbreviations and one numeric offset in the same column
    mixed_tz = pl.Series(
        [
            "2025-01-27 00:07:12 UTC",
            "2025-01-27 00:07:16-05:00",
            "2025-01-27 00:07:20 EST",
        ]
    )

    # Record every warning raised while parsing
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        parsed = parse_datetime_with_tz(mixed_tz)

    # Exactly one warning, naming every timezone that was seen
    assert len(caught) == 1
    assert issubclass(caught[0].category, UserWarning)
    assert "Multiple timezones found" in str(caught[0].message)
    assert "UTC" in str(caught[0].message)
    assert "-05:00" in str(caught[0].message)
    assert "EST" in str(caught[0].message)

    # The warning must not prevent a complete parse
    assert parsed.dtype == pl.Datetime
    assert parsed.is_not_null().all()

test_parse_datetime_with_tz()

Test timezone parsing helper function

Source code in preprocessing/test_series_semantic.py
113
114
115
116
117
118
119
120
121
122
def test_parse_datetime_with_tz():
    """The timezone parsing helper returns a fully parsed Datetime series."""

    # Both values share one timezone, so no mixed-timezone warning is raised
    utc_strings = pl.Series(["2025-02-28 00:36:15 UTC", "2025-02-28 00:36:13 UTC"])
    parsed = parse_datetime_with_tz(utc_strings)

    assert isinstance(parsed, pl.Series)
    assert parsed.dtype == pl.Datetime
    assert parsed.is_not_null().all()

test_parse_datetime_with_tz_no_timezone()

Test datetime parsing without timezone suffix

Source code in preprocessing/test_series_semantic.py
125
126
127
128
129
130
131
def test_parse_datetime_with_tz_no_timezone():
    """Strings without any timezone suffix are still parsed to Datetime."""
    naive_strings = pl.Series(["2025-02-28 00:36:15", "2025-02-28 00:36:13"])
    parsed = parse_datetime_with_tz(naive_strings)

    assert parsed.dtype == pl.Datetime
    assert parsed.is_not_null().all()

test_parse_datetime_with_tz_offset()

Test datetime parsing with timezone offset format

Source code in preprocessing/test_series_semantic.py
134
135
136
137
138
139
140
141
142
def test_parse_datetime_with_tz_offset():
    """Strings ending in a numeric offset such as "-05:00" are parsed to Datetime."""
    offset_strings = pl.Series(
        ["2025-01-27 00:07:12.056000-05:00", "2025-01-27 00:07:16.126000-05:00"]
    )
    parsed = parse_datetime_with_tz(offset_strings)

    assert parsed.dtype == pl.Datetime
    assert parsed.is_not_null().all()

test_text_catch_all()

Test free-form text parsing

Source code in preprocessing/test_series_semantic.py
63
64
65
66
67
68
69
70
71
72
73
74
75
def test_text_catch_all():
    """Arbitrary text is accepted by the catch-all and inferred as free text."""
    posts = pl.Series(
        ["First post with some content!", "a different no all caps post", "THIRD POST"]
    )
    assert text_catch_all.check(posts)

    converted = text_catch_all.try_convert(posts)
    assert converted.dtype == pl.String

    inferred = infer_series_semantic(posts)
    assert inferred.semantic_name == "free_text"
    assert inferred.data_type == "text"

test_threshold_behavior()

Test that recognition threshold works correctly

Source code in preprocessing/test_series_semantic.py
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
def test_threshold_behavior():
    """The `threshold` argument decides whether a partial match is accepted."""

    # Seven date-like values followed by three plain-text values
    values = ["2025-01-01 UTC"] * 7 + ["not_a_date"] * 3
    mostly_dates = pl.Series(values)

    # A 0.5 threshold is satisfied, so the datetime semantic is chosen
    lenient = infer_series_semantic(mostly_dates, threshold=0.5)
    assert lenient.semantic_name == "datetime"

    # A 0.8 threshold is not satisfied, so inference falls back to free_text
    strict = infer_series_semantic(mostly_dates, threshold=0.8)
    assert strict.semantic_name == "free_text"

test_time_string_parsing()

Test time-only string parsing

Source code in preprocessing/test_series_semantic.py
53
54
55
56
57
58
59
60
def test_time_string_parsing():
    """Time-only strings pass the check and convert to non-null `pl.Time`."""
    times = pl.Series(["14:30:15", "09:15:30"])
    assert time_string.check(times)

    converted = time_string.try_convert(times)
    assert converted.dtype == pl.Time
    assert converted.is_not_null().all()