Skip to content

Preprocessing

preprocessing

Modules:

Name Description
series_semantic

Modules

series_semantic

Functions:

Name Description
constant_series

Create a series with a constant value for each row of series.

parse_datetime_with_tz

Parse datetime strings with timezone info (both abbreviations and offsets)

Attributes
Functions
constant_series(series, constant)

Create a Boolean-typed series that repeats the same constant value once for each row of series.

Source code in preprocessing/series_semantic.py
220
221
222
def constant_series(series: pl.Series, constant) -> pl.Series:
    """Build a Boolean series that repeats `constant` once per row of `series`.

    Only the length of `series` is used; its values are never read. The result
    is always constructed with `dtype=pl.Boolean`, regardless of the type of
    `constant`.
    """
    row_count = series.len()
    repeated = [constant for _ in range(row_count)]
    return pl.Series(repeated, dtype=pl.Boolean)
parse_datetime_with_tz(s)

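Parse datetime strings that carry timezone info (either abbreviations or offsets). The timezone suffix is stripped before parsing, and a warning is issued if more than one distinct timezone is found.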

Source code in preprocessing/series_semantic.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
def parse_datetime_with_tz(s: pl.Series) -> pl.Series:
    """Parse datetime strings that end in a timezone abbreviation or UTC offset.

    A trailing timezone marker is stripped from each string before parsing, so
    the marker is discarded rather than applied to the timestamp. Two marker
    shapes are recognised:

    * a space followed by 3-4 uppercase letters (e.g. " UTC", " EST")
    * a signed ``HH:MM`` offset (e.g. "-05:00", "+00:00")

    If more than one distinct marker is present across the series, a
    ``UserWarning`` is emitted and parsing proceeds as though every value
    shared one timezone.

    Args:
        s: String series of datetime values.

    Returns:
        The result of ``str.strptime(pl.Datetime(), strict=False)`` on the
        stripped strings.
    """
    import warnings

    abbrev_pattern = r" ([A-Z]{3,4})$"  # UTC, EST, etc.
    offset_pattern = r"[+-]\d{2}:\d{2}$"  # -05:00, +00:00, etc.

    def _distinct_matches(pattern: str) -> set:
        # Gather every distinct match of `pattern` across all rows; rows that
        # are null or have no match yield a falsy entry and are skipped.
        per_row = s.str.extract_all(pattern).to_list()
        return {hit for hits in per_row if hits for hit in hits}

    abbrevs = _distinct_matches(abbrev_pattern)
    offsets = _distinct_matches(offset_pattern)

    # Abbreviations and offsets are counted separately, so one of each also
    # counts as "multiple timezones".
    if len(abbrevs) + len(offsets) > 1:
        all_tz = [*abbrevs, *offsets]
        warnings.warn(
            f"Multiple timezones found in datetime column: {all_tz}. "
            f"Assuming all timestamps represent the same timezone for analysis purposes.",
            UserWarning,
        )

    # Strip the abbreviation first, then the offset, before parsing.
    stripped = s.str.replace(abbrev_pattern, "").str.replace(offset_pattern, "")
    return stripped.str.strptime(pl.Datetime(), strict=False)