Skip to content

Preprocessing

preprocessing

Modules:

Name Description
series_semantic

Modules

series_semantic

Functions:

Name Description
constant_series

Create a series with a constant value for each row of series.

parse_datetime_with_tz

Parse datetime strings with timezone info (both abbreviations and offsets)

parse_time_military

Parse time strings with multiple format attempts

Attributes
Functions
constant_series(series, constant)

Create a series with a constant value for each row of series.

Source code in preprocessing/series_semantic.py
246
247
248
def constant_series(series: pl.Series, constant) -> pl.Series:
    """Build a series that repeats `constant` once for every row of `series`.

    Only the length of `series` is used; its values are not read. The result
    is always constructed with the `pl.Boolean` dtype.
    """
    row_count = series.len()
    repeated = [constant for _ in range(row_count)]
    return pl.Series(repeated, dtype=pl.Boolean)
parse_datetime_with_tz(s)

Parse datetime strings with timezone info (both abbreviations and offsets)

Source code in preprocessing/series_semantic.py
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
def parse_datetime_with_tz(s: pl.Series) -> pl.Series:
    """Parse datetime strings with timezone info (both abbreviations and offsets).

    A trailing timezone abbreviation (e.g. ``" UTC"``, ``" EST"``) and/or a
    trailing numeric offset (e.g. ``"-05:00"``) is removed from each value,
    and the remaining text is parsed with a non-strict
    ``str.strptime(pl.Datetime())``. The timezone text is stripped, not
    applied to the parsed value.

    Args:
        s: Series of datetime strings.

    Returns:
        The series parsed as ``pl.Datetime`` after the timezone suffix has
        been removed.

    Warns:
        UserWarning: If more than one distinct timezone suffix is found
            across the series.
    """
    import warnings

    # Timezone abbreviations like "UTC", "EST" at the end of the string.
    tz_abbrev_regex = r" ([A-Z]{3,4})$"

    # Timezone offsets like "-05:00", "+00:00" at the end of the string.
    tz_offset_regex = r"[+-]\d{2}:\d{2}$"

    def _unique_matches(pattern: str) -> set:
        """Collect the distinct trailing matches of `pattern` across `s`."""
        # extract_all yields the whole match (not the capture group), so the
        # abbreviation pattern's leading space is stripped here. Rows without
        # a match (empty list or null) are skipped.
        return {
            match.strip()
            for row_matches in s.str.extract_all(pattern).to_list()
            if row_matches
            for match in row_matches
        }

    unique_abbrevs = _unique_matches(tz_abbrev_regex)
    unique_offsets = _unique_matches(tz_offset_regex)

    # Warn if multiple different timezones found. Sorting keeps the message
    # text deterministic regardless of set iteration order.
    if len(unique_abbrevs) + len(unique_offsets) > 1:
        all_tz = sorted(unique_abbrevs) + sorted(unique_offsets)
        warnings.warn(
            f"Multiple timezones found in datetime column: {all_tz}. "
            f"Assuming all timestamps represent the same timezone for analysis purposes.",
            UserWarning,
            stacklevel=2,  # attribute the warning to the caller, not this helper
        )

    # Remove timezone abbreviations first, then timezone offsets.
    without_abbrev = s.str.replace(tz_abbrev_regex, "")
    without_tz = without_abbrev.str.replace(tz_offset_regex, "")

    return without_tz.str.strptime(pl.Datetime(), strict=False)
parse_time_military(s)

Parse time strings with multiple format attempts

Source code in preprocessing/series_semantic.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
def parse_time_military(s: pl.Series) -> pl.Series:
    """Parse time strings with multiple format attempts.

    The formats are tried in order: ``%H:%M:%S``, ``%H:%M``,
    ``%I:%M:%S %p``, ``%I:%M %p``. The result of the first format that
    yields at least one non-null value is returned as-is; later formats are
    not tried for the remaining values.

    Args:
        s: Series of time strings.

    Returns:
        A ``pl.Time`` series from the first format that parsed anything, or
        an all-null ``pl.Time`` series of the same length if none did.
    """
    time_formats = ("%H:%M:%S", "%H:%M", "%I:%M:%S %p", "%I:%M %p")

    for fmt in time_formats:
        try:
            parsed = s.str.strptime(pl.Time, format=fmt, strict=False)
        except Exception:
            # Best effort: a format that cannot be applied is skipped.
            # Unlike a bare `except`, this lets KeyboardInterrupt and
            # SystemExit propagate.
            continue
        if parsed.is_not_null().any():  # at least one value parsed
            return parsed

    # If all formats fail, return nulls
    return pl.Series([None] * s.len(), dtype=pl.Time)