Tokenizer

services.tokenizer.core

Core tokenizer components.

Exports:

- AbstractTokenizer
- TokenizerConfig
- TokenList
- LanguageFamily
- TokenType
- CaseHandling

Intended usage

from services.tokenizer.core import (
    AbstractTokenizer,
    TokenizerConfig,
    TokenList,
    LanguageFamily,
    TokenType,
    CaseHandling,
)
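
A short usage sketch follows. `run` is a hypothetical helper shown only to illustrate how the exported names fit together, and the configuration values are illustrative.

def run(tokenizer: AbstractTokenizer, text: str) -> TokenList:
    # Any concrete AbstractTokenizer subclass can be swapped in here.
    return tokenizer.tokenize(text)

# TokenizerConfig fields are documented below; these values are examples only.
config = TokenizerConfig(
    case_handling=CaseHandling.PRESERVE,
    include_emoji=True,
    fallback_language_family=LanguageFamily.MIXED,
)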

Modules:

Name   Description
base   AbstractTokenizer abstract base class
types  TokenizerConfig, enums, and shared types

Classes:

Name               Description
AbstractTokenizer  Abstract base class for all tokenizer implementations.
CaseHandling       How to handle character case during tokenization.
LanguageFamily     Language families that affect tokenization strategies.
TokenType          Types of tokens that can be extracted.
TokenizerConfig    Configuration for tokenizer behavior.

AbstractTokenizer

Bases: ABC

Abstract base class for all tokenizer implementations.

This class defines the core interface that all tokenizer plugins must implement. It provides a clean contract for tokenization operations while allowing for different implementation strategies.
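
To make the contract concrete, here is a minimal sketch of a subclass; WhitespaceTokenizer is a hypothetical example, not part of the library.

class WhitespaceTokenizer(AbstractTokenizer):
    """Hypothetical example: the simplest possible concrete tokenizer."""

    def tokenize(self, text: str) -> TokenList:
        # Reuse base-class preprocessing (case handling, Unicode normalization) ...
        processed = self._preprocess_text(text)
        if not processed:
            return []
        # ... split naively on whitespace, then apply base-class filtering
        # (whitespace stripping, emoji filtering, length limits).
        return self._postprocess_tokens(processed.split())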

Methods:

Name      Description
__init__  Initialize the tokenizer with configuration.
tokenize  Tokenize input text into a list of tokens.

Attributes:

Name    Type             Description
config  TokenizerConfig  Get the current tokenizer configuration.

Source code in services/tokenizer/core/base.py
class AbstractTokenizer(ABC):
    """
    Abstract base class for all tokenizer implementations.

    This class defines the core interface that all tokenizer plugins must implement.
    It provides a clean contract for tokenization operations while allowing for
    different implementation strategies.
    """

    def __init__(self, config: Optional[TokenizerConfig] = None):
        """
        Initialize the tokenizer with configuration.

        Args:
            config: Tokenizer configuration. If None, default config will be used.
        """
        self._config = config or TokenizerConfig()

    @property
    def config(self) -> TokenizerConfig:
        """Get the current tokenizer configuration."""
        return self._config

    @abstractmethod
    def tokenize(self, text: str) -> TokenList:
        """
        Tokenize input text into a list of tokens.

        This is the main tokenization method that all implementations must provide.

        Args:
            text: Input text to tokenize

        Returns:
            List of tokens extracted from the input text
        """
        pass

    def _preprocess_text(self, text: str) -> str:
        """
        Apply preprocessing to text before tokenization.

        This method applies configuration-based preprocessing such as
        case handling and Unicode normalization.

        Args:
            text: Input text to preprocess

        Returns:
            Preprocessed text
        """
        if not text:
            return text

        # Apply Unicode normalization
        if self._config.normalize_unicode:
            import unicodedata

            text = unicodedata.normalize("NFKC", text)

        # Apply case handling
        from .types import CaseHandling

        if self._config.case_handling == CaseHandling.LOWERCASE:
            text = text.lower()
        elif self._config.case_handling == CaseHandling.UPPERCASE:
            text = text.upper()
        elif self._config.case_handling == CaseHandling.NORMALIZE:
            # TODO: Implement proper noun detection for smart normalization
            # Currently using simple lowercase as a placeholder
            text = text.lower()

        return text

    def _postprocess_tokens(self, tokens: TokenList) -> TokenList:
        """
        Apply post-processing to extracted tokens.

        This method applies configuration-based filtering and cleanup
        to the token list.

        Args:
            tokens: List of raw tokens

        Returns:
            Processed token list
        """
        if not tokens:
            return tokens

        processed_tokens = []

        for token in tokens:
            # Strip whitespace if configured
            if self._config.strip_whitespace:
                token = token.strip()

            # Skip empty tokens
            if not token:
                continue

            # Filter emojis if not included
            if not self._config.include_emoji and self._is_emoji(token):
                continue

            # Apply length filtering
            if len(token) < self._config.min_token_length:
                continue

            if (
                self._config.max_token_length is not None
                and len(token) > self._config.max_token_length
            ):
                continue

            processed_tokens.append(token)

        return processed_tokens

    @staticmethod
    def _is_emoji(token: str) -> bool:
        """
        Check if a token is an emoji character.

        Args:
            token: Token to check

        Returns:
            True if the token is an emoji, False otherwise
        """
        if not token:
            return False

        # Accept sequences made of emoji code points plus common modifiers
        EMOJI_RANGES = (
            (0x1F600, 0x1F64F),  # Emoticons
            (0x1F300, 0x1F5FF),  # Misc Symbols & Pictographs
            (0x1F680, 0x1F6FF),  # Transport & Map
            (0x1F1E6, 0x1F1FF),  # Regional Indicators
            (0x2600, 0x26FF),  # Misc symbols
            (0x2700, 0x27BF),  # Dingbats
            (0x1F900, 0x1F9FF),  # Supplemental Symbols & Pictographs
            (0x1FA70, 0x1FAFF),  # Symbols & Pictographs Extended-A
        )
        MODIFIERS = {0x200D, 0xFE0E, 0xFE0F}  # ZWJ, VS15, VS16
        SKIN_TONE = (0x1F3FB, 0x1F3FF)
        TAGS = (0xE0020, 0xE007F)  # Emoji tag sequences

        def in_any_range(cp: int, ranges) -> bool:
            for a, b in ranges:
                if a <= cp <= b:
                    return True
            return False

        def is_modifier(cp: int) -> bool:
            return (
                cp in MODIFIERS
                or SKIN_TONE[0] <= cp <= SKIN_TONE[1]
                or TAGS[0] <= cp <= TAGS[1]
            )

        for ch in token:
            cp = ord(ch)
            if not (in_any_range(cp, EMOJI_RANGES) or is_modifier(cp)):
                return False
        return True

__init__(config=None)

Initialize the tokenizer with configuration.

Parameters:

Name    Type                       Description                                                      Default
config  Optional[TokenizerConfig]  Tokenizer configuration. If None, default config will be used.  None

Source code in services/tokenizer/core/base.py
def __init__(self, config: Optional[TokenizerConfig] = None):
    """
    Initialize the tokenizer with configuration.

    Args:
        config: Tokenizer configuration. If None, default config will be used.
    """
    self._config = config or TokenizerConfig()

config property

Get the current tokenizer configuration.

tokenize(text) abstractmethod

Tokenize input text into a list of tokens.

This is the main tokenization method that all implementations must provide.

Parameters:

Name  Type  Description             Default
text  str   Input text to tokenize  required

Returns:

Type       Description
TokenList  List of tokens extracted from the input text

Source code in services/tokenizer/core/base.py
@abstractmethod
def tokenize(self, text: str) -> TokenList:
    """
    Tokenize input text into a list of tokens.

    This is the main tokenization method that all implementations must provide.

    Args:
        text: Input text to tokenize

    Returns:
        List of tokens extracted from the input text
    """
    pass

CaseHandling

Bases: Enum

How to handle character case during tokenization.

Source code in services/tokenizer/core/types.py
class CaseHandling(Enum):
    """How to handle character case during tokenization."""

    PRESERVE = "preserve"  # Keep original case
    LOWERCASE = "lowercase"  # Convert to lowercase
    UPPERCASE = "uppercase"  # Convert to uppercase
    NORMALIZE = "normalize"  # Smart case normalization

LanguageFamily

Bases: str, Enum

Language families that affect tokenization strategies.

Source code in services/tokenizer/core/types.py
class LanguageFamily(str, Enum):
    """Language families that affect tokenization strategies."""

    LATIN = "latin"  # Space-separated languages (English, French, etc.)
    CJK = "cjk"  # Chinese, Japanese, Korean
    ARABIC = "arabic"  # Arabic script languages
    MIXED = "mixed"  # Mixed content requiring multiple strategies
    UNKNOWN = "unknown"  # Language detection failed or not performed

TokenType

Bases: str, Enum

Types of tokens that can be extracted.

Source code in services/tokenizer/core/types.py
class TokenType(str, Enum):
    """Types of tokens that can be extracted."""

    WORD = "word"  # Regular words
    PUNCTUATION = "punctuation"  # Punctuation marks
    NUMERIC = "numeric"  # Numbers
    EMOJI = "emoji"  # Emoji characters
    HASHTAG = "hashtag"  # Social media hashtags
    MENTION = "mention"  # Social media mentions
    URL = "url"  # URLs and links
    EMAIL = "email"  # Email addresses
    WHITESPACE = "whitespace"  # Whitespace (when preserved)

TokenizerConfig pydantic-model

Bases: BaseModel

Configuration for tokenizer behavior.

Controls all aspects of text tokenization including script handling, social media entity processing, and output formatting.

Social Media Entity Behavior:
- extract_hashtags/extract_mentions: When False, splits into component words
- include_urls/include_emails: When False, completely excludes (no fragmentation)

Fields:

Source code in services/tokenizer/core/types.py
class TokenizerConfig(BaseModel):
    """Configuration for tokenizer behavior.

    Controls all aspects of text tokenization including script handling,
    social media entity processing, and output formatting.

    Social Media Entity Behavior:
    - extract_hashtags/extract_mentions: When False, splits into component words
    - include_urls/include_emails: When False, completely excludes (no fragmentation)
    """

    # Language detection settings
    fallback_language_family: LanguageFamily = LanguageFamily.MIXED
    """Default language family when detection fails or mixed content is found."""

    # Token type filtering
    include_punctuation: bool = False
    """Whether to include punctuation marks as separate tokens."""

    include_numeric: bool = True
    """Whether to include numeric tokens (integers, decimals, etc.)."""

    include_emoji: bool = False
    """Whether to include emoji characters as tokens."""

    # Text preprocessing
    case_handling: CaseHandling = CaseHandling.LOWERCASE
    """How to handle character case during tokenization."""

    normalize_unicode: bool = True
    """Whether to apply Unicode NFKC normalization for consistent character representation."""

    # Social media features
    extract_hashtags: bool = True
    """Whether to preserve hashtags as single tokens. If False, splits into component words."""

    extract_mentions: bool = True
    """Whether to preserve @mentions as single tokens. If False, splits into component words."""

    include_urls: bool = True
    """Whether to include URLs as tokens. If False, URLs are completely excluded (not fragmented)."""

    include_emails: bool = True
    """Whether to include email addresses as tokens. If False, emails are completely excluded (not fragmented)."""

    # Output formatting
    min_token_length: int = 1
    """Minimum length for tokens to be included in output."""

    max_token_length: Optional[int] = None
    """Maximum length for tokens. If None, no length limit is applied."""

    strip_whitespace: bool = True
    """Whether to strip leading/trailing whitespace from tokens."""

case_handling = CaseHandling.LOWERCASE pydantic-field

How to handle character case during tokenization.

extract_hashtags = True pydantic-field

Whether to preserve hashtags as single tokens. If False, splits into component words.

extract_mentions = True pydantic-field

Whether to preserve @mentions as single tokens. If False, splits into component words.

fallback_language_family = LanguageFamily.MIXED pydantic-field

Default language family when detection fails or mixed content is found.

include_emails = True pydantic-field

Whether to include email addresses as tokens. If False, emails are completely excluded (not fragmented).

include_emoji = False pydantic-field

Whether to include emoji characters as tokens.

include_numeric = True pydantic-field

Whether to include numeric tokens (integers, decimals, etc.).

include_punctuation = False pydantic-field

Whether to include punctuation marks as separate tokens.

include_urls = True pydantic-field

Whether to include URLs as tokens. If False, URLs are completely excluded (not fragmented).

max_token_length = None pydantic-field

Maximum length for tokens. If None, no length limit is applied.

min_token_length = 1 pydantic-field

Minimum length for tokens to be included in output.

normalize_unicode = True pydantic-field

Whether to apply Unicode NFKC normalization for consistent character representation.

strip_whitespace = True pydantic-field

Whether to strip leading/trailing whitespace from tokens.
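
A brief configuration sketch based on the behavior described above; the chosen values are illustrative only.

# Keep hashtags and mentions as single tokens, drop URLs outright (not fragmented),
# and ignore single-character tokens.
config = TokenizerConfig(
    extract_hashtags=True,
    extract_mentions=True,
    include_urls=False,
    min_token_length=2,
)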

base

AbstractTokenizer abstract base class

This module contains the abstract base class that defines the interface for all tokenizer implementations.

Classes:

Name               Description
AbstractTokenizer  Abstract base class for all tokenizer implementations (documented in full above).

types

TokenizerConfig, enums, and shared types

This module contains configuration models, enumerations, and shared type definitions used across the tokenizer service.

Classes:

Name             Description
CaseHandling     How to handle character case during tokenization.
LanguageFamily   Language families that affect tokenization strategies.
TokenType        Types of tokens that can be extracted.
TokenizerConfig  Configuration for tokenizer behavior.

All four classes are documented in full above under services.tokenizer.core.

services.tokenizer.basic

Basic tokenizer implementation.

This module exports the BasicTokenizer implementation that provides fundamental Unicode-aware tokenization capabilities for social media text.
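
For reference, a sketch of the corresponding imports, using the names listed below:

from services.tokenizer.basic import (
    BasicTokenizer,
    TokenizerConfig,
    create_basic_tokenizer,
    get_patterns,
    tokenize_text,
)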

Modules:

Name       Description
patterns   Regex patterns for text tokenization.
tokenizer  BasicTokenizer implementation.

Classes:

Name             Description
BasicTokenizer   Unicode-aware basic tokenizer for social media text.
TokenizerConfig  Configuration for tokenizer behavior (documented above under services.tokenizer.core).

Functions:

Name                    Description
create_basic_tokenizer  Create a BasicTokenizer with optional configuration.
get_patterns            Get global TokenizerPatterns instance.
tokenize_text           Simple convenience function for basic text tokenization.

BasicTokenizer

Bases: AbstractTokenizer

Unicode-aware basic tokenizer for social media text.

This tokenizer handles mixed-script content, preserves social media entities (@mentions, #hashtags, URLs), and applies appropriate tokenization strategies for different script families.
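
A minimal usage sketch; the input string and the output shown in the comment are illustrative, and the real result depends on the active TokenizerConfig.

tokenizer = BasicTokenizer()  # default config: lowercase, hashtags/mentions/URLs preserved
tokens = tokenizer.tokenize("Check out #Python tips from @alice at https://example.com!")
# With the defaults described in TokenizerConfig, the social media entities stay intact,
# roughly: ['check', 'out', '#python', 'tips', 'from', '@alice', 'at', 'https://example.com']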

Methods:

Name      Description
__init__  Initialize BasicTokenizer with configuration.
tokenize  Tokenize input text into a list of tokens.

Source code in services/tokenizer/basic/tokenizer.py
class BasicTokenizer(AbstractTokenizer):
    """
    Unicode-aware basic tokenizer for social media text.

    This tokenizer handles mixed-script content, preserves social media entities
    (@mentions, #hashtags, URLs), and applies appropriate tokenization strategies
    for different script families.
    """

    def __init__(self, config: Optional[TokenizerConfig] = None):
        """
        Initialize BasicTokenizer with configuration.

        Args:
            config: Tokenizer configuration. If None, default config will be used.
        """
        super().__init__(config)
        self._patterns = get_patterns()

    def tokenize(self, text: str) -> TokenList:
        """
        Tokenize input text into a list of tokens.

        Applies appropriate tokenization strategies for mixed-script content
        while preserving social media entities and handling Unicode correctly.

        Args:
            text: Input text to tokenize

        Returns:
            List of tokens extracted from the input text in document order
        """
        if not text:
            return []

        # Apply preprocessing
        processed_text = self._preprocess_text(text)
        if not processed_text:
            return []

        # Extract tokens using comprehensive regex pattern
        tokens = self._extract_tokens(processed_text)

        # Apply post-processing
        return self._postprocess_tokens(tokens)

    def _extract_tokens(self, text: str) -> TokenList:
        """
        Extract tokens using comprehensive regex patterns.
        Preserves the original order of tokens as they appear in the input text.

        Args:
            text: Preprocessed text to tokenize

        Returns:
            List of extracted tokens in their original order
        """
        return self._extract_tokens_ordered(text, LanguageFamily.MIXED)

    def _is_char_level_script(self, char: str) -> bool:
        """Check if character belongs to a script that uses character-level tokenization (scriptio continua)."""
        code_point = ord(char)
        return (
            (0x4E00 <= code_point <= 0x9FFF)  # CJK Unified Ideographs
            or (0x3400 <= code_point <= 0x4DBF)  # CJK Extension A
            or (0x3040 <= code_point <= 0x309F)  # Hiragana
            or (0x30A0 <= code_point <= 0x30FF)  # Katakana
            or (0xAC00 <= code_point <= 0xD7AF)  # Hangul Syllables
            or (0x0E00 <= code_point <= 0x0E7F)  # Thai
            or (0x0E80 <= code_point <= 0x0EFF)  # Lao
            or (0x1000 <= code_point <= 0x109F)  # Myanmar
            or (0x1780 <= code_point <= 0x17FF)  # Khmer
        )

    def _get_char_script(self, char: str) -> str:
        """
        Get the script family for a character.

        Args:
            char: Character to analyze

        Returns:
            Script family name
        """
        code_point = ord(char)

        # Latin script
        if (
            (0x0041 <= code_point <= 0x007A)
            or (0x00C0 <= code_point <= 0x024F)
            or (0x1E00 <= code_point <= 0x1EFF)
        ):
            return "latin"

        # Character-level scripts (CJK, Thai, etc.)
        elif self._is_char_level_script(char):
            return "cjk"

        # Arabic script
        elif (
            (0x0600 <= code_point <= 0x06FF)
            or (0x0750 <= code_point <= 0x077F)
            or (0x08A0 <= code_point <= 0x08FF)
        ):
            return "arabic"

        else:
            return "other"

    def _extract_tokens_ordered(
        self, text: str, language_family: LanguageFamily
    ) -> TokenList:
        """
        Extract tokens preserving their original order in the text.

        Uses a single comprehensive regex pattern to find ALL tokens in document order,
        eliminating the need for complex segmentation and reassembly logic.
        This is the Phase 2 optimization that removes O(n×segments) complexity.

        Args:
            text: Preprocessed text to tokenize
            language_family: Detected language family for the full text

        Returns:
            List of extracted tokens in their original order
        """
        if not text.strip():
            return []

        # Remove excluded entities (URLs/emails) from text if they are disabled
        # This prevents them from being tokenized into component words
        exclusion_pattern = self._patterns.get_exclusion_pattern(self._config)
        if exclusion_pattern:
            # Replace excluded entities with spaces to maintain word boundaries
            text = exclusion_pattern.sub(" ", text)
            # Clean up multiple spaces
            text = " ".join(text.split())

        if not text.strip():
            return []

        # Get comprehensive pattern based on configuration
        # This single pattern finds ALL tokens in document order
        comprehensive_pattern = self._patterns.get_comprehensive_pattern(self._config)

        # Single regex call gets all tokens in order - this is the key optimization!
        raw_tokens = comprehensive_pattern.findall(text)

        # If no tokens were found but input has content, use fallback for edge cases
        if not raw_tokens and text.strip():
            # For pure punctuation or unrecognized content, return as single token
            # This maintains compatibility with old tokenizer behavior for edge cases
            return [text.strip()]

        # Apply postprocessing for language-specific behavior and configuration filtering
        tokens = []
        for token in raw_tokens:
            if not token.strip():
                continue

            # Clean URLs by removing trailing punctuation
            if self._is_url_like(token):
                token = self._clean_url_token(token)

            # For character-level scripts, break down multi-character tokens into individual characters
            # This maintains compatibility with existing test expectations
            if (
                language_family == LanguageFamily.CJK
                and self._contains_char_level_chars(token)
            ):
                # Only break down pure character-level tokens, not mixed tokens
                if self._is_pure_char_level_token(token):
                    tokens.extend(list(token))
                else:
                    # Mixed token - keep as is but process character-level parts
                    tokens.append(token)
            elif language_family == LanguageFamily.MIXED:
                # For mixed script, break down character-level script parts but keep Latin parts whole
                processed_tokens = self._process_mixed_script_token(token)
                tokens.extend(processed_tokens)
            else:
                tokens.append(token)

        return [token for token in tokens if token.strip()]

    def _is_punctuation_only(self, token: str) -> bool:
        """Check if token contains only punctuation."""
        punctuation_chars = ".!?;:,()[]{}\"'-~`@#$%^&*+=<>/|\\"
        return all(c in punctuation_chars for c in token)

    def _is_numeric_only(self, token: str) -> bool:
        """Check if token is purely numeric."""
        return (
            token.replace(".", "")
            .replace(",", "")
            .replace("%", "")
            .replace("$", "")
            .isdigit()
        )

    def _is_url_like(self, token: str) -> bool:
        """Check if token looks like a URL."""
        # Don't classify emails as URLs
        if self._is_email_like(token):
            return False

        # Explicit URL indicators (http://, https://, www., or protocol markers)
        if token.startswith(("http://", "https://", "www.")) or "://" in token:
            return True

        # Domain-like patterns (e.g., "example.com")
        # But NOT abbreviations (e.g., "U.S.", "c.e.o.s")
        # Heuristic: URLs have at least one period NOT followed by a single uppercase/lowercase letter
        # This allows "example.com" but excludes "U.S." and "c.e.o.s"
        if (
            token.count(".") >= 1
            and any(c.isalpha() for c in token)
            and "@" not in token
        ):
            # Check if this looks like an abbreviation (single letters between periods)
            # Pattern: letter(s).letter(s).letter(s) where segments are 1-3 chars
            abbreviation_pattern = r"^[a-z]{1,3}(?:\.[a-z]{1,3})+\.?$"

            if re.match(abbreviation_pattern, token, re.IGNORECASE):
                return False  # This is an abbreviation, not a URL
            # If it has a period and looks like a domain, it's URL-like
            return True

        return False

    def _is_email_like(self, token: str) -> bool:
        """Check if token looks like an email address."""
        return "@" in token and "." in token and not token.startswith("@")

    def _clean_url_token(self, url_token: str) -> str:
        """Remove trailing punctuation from URL tokens."""
        trailing_punctuation = ".!?;:,)]}\"'"
        return url_token.rstrip(trailing_punctuation)

    def _contains_char_level_chars(self, token: str) -> bool:
        """Check if token contains any character-level script characters."""
        return any(self._is_char_level_script(char) for char in token)

    def _is_pure_char_level_token(self, token: str) -> bool:
        """Check if token contains only character-level script characters."""
        return all(self._is_char_level_script(char) or char.isspace() for char in token)

    def _process_mixed_script_token(self, token: str) -> TokenList:
        """Process mixed script tokens by breaking down character-level script parts."""
        if not self._contains_char_level_chars(token):
            return [token]

        result = []
        current_token = ""
        current_is_cjk = None

        for char in token:
            char_is_cjk = self._is_char_level_script(char)

            if current_is_cjk is None:
                current_is_cjk = char_is_cjk
                current_token = char
            elif char_is_cjk == current_is_cjk:
                current_token += char
            else:
                # Script change
                if current_token.strip():
                    if current_is_cjk and len(current_token) > 1:
                        # Break CJK into individual characters
                        result.extend(list(current_token))
                    else:
                        result.append(current_token)
                current_token = char
                current_is_cjk = char_is_cjk

        # Handle final token
        if current_token.strip():
            if current_is_cjk and len(current_token) > 1:
                result.extend(list(current_token))
            else:
                result.append(current_token)

        return result

    def _postprocess_tokens(self, tokens: TokenList) -> TokenList:
        """
        Apply post-processing to extracted tokens.

        Args:
            tokens: List of raw tokens

        Returns:
            Processed token list
        """
        if not tokens:
            return tokens

        # Apply base class post-processing (length filtering, whitespace stripping, etc.)
        return super()._postprocess_tokens(tokens)

__init__(config=None)

Initialize BasicTokenizer with configuration.

Parameters:

Name    Type                       Description                                                      Default
config  Optional[TokenizerConfig]  Tokenizer configuration. If None, default config will be used.  None

Source code in services/tokenizer/basic/tokenizer.py
def __init__(self, config: Optional[TokenizerConfig] = None):
    """
    Initialize BasicTokenizer with configuration.

    Args:
        config: Tokenizer configuration. If None, default config will be used.
    """
    super().__init__(config)
    self._patterns = get_patterns()

tokenize(text)

Tokenize input text into a list of tokens.

Applies appropriate tokenization strategies for mixed-script content while preserving social media entities and handling Unicode correctly.

Parameters:

Name  Type  Description             Default
text  str   Input text to tokenize  required

Returns:

Type       Description
TokenList  List of tokens extracted from the input text in document order

Source code in services/tokenizer/basic/tokenizer.py
def tokenize(self, text: str) -> TokenList:
    """
    Tokenize input text into a list of tokens.

    Applies appropriate tokenization strategies for mixed-script content
    while preserving social media entities and handling Unicode correctly.

    Args:
        text: Input text to tokenize

    Returns:
        List of tokens extracted from the input text in document order
    """
    if not text:
        return []

    # Apply preprocessing
    processed_text = self._preprocess_text(text)
    if not processed_text:
        return []

    # Extract tokens using comprehensive regex pattern
    tokens = self._extract_tokens(processed_text)

    # Apply post-processing
    return self._postprocess_tokens(tokens)

create_basic_tokenizer(config=None)

Create a BasicTokenizer with optional configuration.

Source code in services/tokenizer/basic/__init__.py
def create_basic_tokenizer(config: TokenizerConfig | None = None) -> BasicTokenizer:
    """Create a BasicTokenizer with optional configuration."""
    if config is None:
        config = TokenizerConfig()
    return BasicTokenizer(config)
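
For example (a sketch; the configuration value is illustrative):

# The factory is equivalent to constructing BasicTokenizer directly.
tok = create_basic_tokenizer(TokenizerConfig(include_punctuation=True))
same = BasicTokenizer(TokenizerConfig(include_punctuation=True))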

get_patterns()

Get global TokenizerPatterns instance.

Returns:

Type               Description
TokenizerPatterns  Singleton TokenizerPatterns instance

Source code in services/tokenizer/basic/patterns.py
def get_patterns() -> TokenizerPatterns:
    """
    Get global TokenizerPatterns instance.

    Returns:
        Singleton TokenizerPatterns instance
    """
    global _global_patterns
    if _global_patterns is None:
        _global_patterns = TokenizerPatterns()
    return _global_patterns

patterns

Regex patterns for text tokenization.

This module contains compiled regular expressions for extracting different types of tokens from social media text, falling back from the third-party regex module to the standard library re module when pattern compilation fails.

Classes:

Name               Description
TokenizerPatterns  Compiled regex patterns for tokenization.

Functions:

Name          Description
get_patterns  Get global TokenizerPatterns instance.

TokenizerPatterns

Compiled regex patterns for tokenization.

Organizes patterns logically and provides efficient compiled regex objects for different token types found in social media text.
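
A short sketch of how the registry is typically consumed, using the methods and pattern names documented below; the sample text is illustrative.

patterns = get_patterns()                      # module-level singleton
url_re = patterns.get_pattern("url")           # one compiled pattern by name
combined = patterns.get_comprehensive_pattern(TokenizerConfig())
matches = combined.findall("see https://example.com #docs")
print(patterns.list_patterns())                # all registered pattern names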

Methods:

Name                       Description
__init__                   Initialize and compile all tokenization patterns.
get_comprehensive_pattern  Build comprehensive tokenization pattern based on configuration.
get_exclusion_pattern      Build pattern to identify and skip excluded entities in text.
get_pattern                Get compiled pattern by name.
list_patterns              Get list of available pattern names.

Source code in services/tokenizer/basic/patterns.py
class TokenizerPatterns:
    """
    Compiled regex patterns for tokenization.

    Organizes patterns logically and provides efficient compiled regex objects
    for different token types found in social media text.
    """

    def __init__(self):
        """Initialize and compile all tokenization patterns."""
        self._patterns: Dict[str, Any] = {}
        self._compile_patterns()

    def get_pattern(self, pattern_name: str) -> Any:
        """
        Get compiled pattern by name.

        Args:
            pattern_name: Name of the pattern to retrieve

        Returns:
            Compiled regex pattern

        Raises:
            KeyError: If pattern name is not found
        """
        if pattern_name not in self._patterns:
            raise KeyError(f"Pattern '{pattern_name}' not found")
        return self._patterns[pattern_name]

    def get_comprehensive_pattern(self, config) -> Any:
        """
        Build comprehensive tokenization pattern based on configuration.

        This creates a single regex pattern that finds ALL tokens in document order,
        eliminating the need for segmentation and reassembly. URLs and emails are
        conditionally included in the regex itself based on configuration, avoiding
        the need for post-processing filtering.

        Args:
            config: TokenizerConfig specifying which token types to include

        Returns:
            Compiled regex pattern that matches all desired token types in priority order
        """
        pattern_parts = []

        # Conditionally add URL and email patterns based on configuration
        # This eliminates the need for post-processing filtering
        if config.include_urls:
            pattern_parts.append(self.get_pattern("url").pattern)

        if config.include_emails:
            pattern_parts.append(self.get_pattern("email").pattern)

        if config.extract_mentions:
            pattern_parts.append(self.get_pattern("mention").pattern)

        if config.extract_hashtags:
            pattern_parts.append(self.get_pattern("hashtag").pattern)

        if config.include_emoji:
            pattern_parts.append(self.get_pattern("emoji").pattern)

        if config.include_numeric:
            pattern_parts.append(self.get_pattern("numeric").pattern)

        # Always include word pattern (this is the core tokenization)
        pattern_parts.append(self.get_pattern("word").pattern)

        if config.include_punctuation:
            pattern_parts.append(self.get_pattern("punctuation").pattern)

        # Don't add the greedy fallback - let configuration control what gets captured

        # Combine patterns with alternation (| operator)
        comprehensive_pattern = "(?:" + "|".join(pattern_parts) + ")"

        try:
            return REGEX_MODULE.compile(comprehensive_pattern, REGEX_MODULE.IGNORECASE)
        except Exception:
            # Fallback to standard re module
            if REGEX_AVAILABLE and REGEX_MODULE is not re:
                try:
                    return re.compile(comprehensive_pattern, re.IGNORECASE)
                except Exception:
                    # Ultimate fallback - just match words
                    return re.compile(r"\S+", re.IGNORECASE)
            else:
                return re.compile(r"\S+", re.IGNORECASE)

    def get_exclusion_pattern(self, config) -> Any:
        """
        Build pattern to identify and skip excluded entities in text.

        This creates a pattern that matches URLs and emails that should be excluded,
        allowing the tokenizer to skip over them entirely instead of breaking them
        into component words.

        Args:
            config: TokenizerConfig specifying which token types to exclude

        Returns:
            Compiled regex pattern that matches excluded entities, or None if no exclusions
        """
        exclusion_parts = []

        if not config.include_urls:
            exclusion_parts.append(self.get_pattern("url").pattern)

        if not config.include_emails:
            exclusion_parts.append(self.get_pattern("email").pattern)

        if not config.include_numeric:
            exclusion_parts.append(self.get_pattern("numeric").pattern)

        if not exclusion_parts:
            return None

        # Combine exclusion patterns
        exclusion_pattern = "(?:" + "|".join(exclusion_parts) + ")"

        try:
            return REGEX_MODULE.compile(exclusion_pattern, REGEX_MODULE.IGNORECASE)
        except Exception:
            # Fallback to standard re module
            if REGEX_AVAILABLE and REGEX_MODULE is not re:
                try:
                    return re.compile(exclusion_pattern, re.IGNORECASE)
                except Exception:
                    return None
            else:
                return None

    def list_patterns(self) -> List[str]:
        """Get list of available pattern names."""
        return list(self._patterns.keys())

    def _compile_patterns(self):
        """Compile all regex patterns with fallback support."""

        # Compile patterns with fallback handling
        patterns_to_compile = {
            "url": URL_PATTERN,
            "email": EMAIL_PATTERN,
            "mention": MENTION_PATTERN,
            "hashtag": HASHTAG_PATTERN,
            "emoji": EMOJI_PATTERN,
            "numeric": NUMERIC_PATTERN,
            "word": WORD_PATTERN,
            "latin_word": LATIN_WORD_PATTERN,
            "cjk_chars": CJK_PATTERN,
            "arabic_chars": ARABIC_PATTERN,
            "punctuation": PUNCTUATION_PATTERN,
            "social_media": SOCIAL_MEDIA_PATTERN,
            "word_boundary": WORD_BOUNDARY_PATTERN,
            "combined_social_entities": COMBINED_SOCIAL_ENTITIES_PATTERN,
        }

        for name, pattern in patterns_to_compile.items():
            try:
                self._patterns[name] = REGEX_MODULE.compile(
                    pattern, REGEX_MODULE.IGNORECASE
                )
            except Exception:
                # If compilation fails with regex module, fall back to re
                if REGEX_AVAILABLE and REGEX_MODULE is not re:
                    try:
                        self._patterns[name] = re.compile(pattern, re.IGNORECASE)
                    except Exception:
                        # If both fail, create a simple fallback
                        self._patterns[name] = re.compile(r"\S+", re.IGNORECASE)
                else:
                    # Already using re module, create simple fallback
                    self._patterns[name] = re.compile(r"\S+", re.IGNORECASE)
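The fallback branches above assume a module-level choice between the third-party regex package and the standard re module. A minimal sketch of that import idiom, using the REGEX_MODULE and REGEX_AVAILABLE names referenced in the listings (the actual module-level code is not reproduced on this page and may differ):

import re

try:
    import regex as REGEX_MODULE  # third-party engine with richer Unicode support
    REGEX_AVAILABLE = True
except ImportError:
    REGEX_MODULE = re  # fall back to the standard-library engine
    REGEX_AVAILABLE = False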
__init__()

Initialize and compile all tokenization patterns.

Source code in services/tokenizer/basic/patterns.py
def __init__(self):
    """Initialize and compile all tokenization patterns."""
    self._patterns: Dict[str, Any] = {}
    self._compile_patterns()
get_comprehensive_pattern(config)

Build comprehensive tokenization pattern based on configuration.

This creates a single regex pattern that finds ALL tokens in document order, eliminating the need for segmentation and reassembly. URLs and emails are conditionally included in the regex itself based on configuration, avoiding the need for post-processing filtering.

Parameters:

config (required): TokenizerConfig specifying which token types to include

Returns:

Any: Compiled regex pattern that matches all desired token types in priority order

Source code in services/tokenizer/basic/patterns.py
def get_comprehensive_pattern(self, config) -> Any:
    """
    Build comprehensive tokenization pattern based on configuration.

    This creates a single regex pattern that finds ALL tokens in document order,
    eliminating the need for segmentation and reassembly. URLs and emails are
    conditionally included in the regex itself based on configuration, avoiding
    the need for post-processing filtering.

    Args:
        config: TokenizerConfig specifying which token types to include

    Returns:
        Compiled regex pattern that matches all desired token types in priority order
    """
    pattern_parts = []

    # Conditionally add URL and email patterns based on configuration
    # This eliminates the need for post-processing filtering
    if config.include_urls:
        pattern_parts.append(self.get_pattern("url").pattern)

    if config.include_emails:
        pattern_parts.append(self.get_pattern("email").pattern)

    if config.extract_mentions:
        pattern_parts.append(self.get_pattern("mention").pattern)

    if config.extract_hashtags:
        pattern_parts.append(self.get_pattern("hashtag").pattern)

    if config.include_emoji:
        pattern_parts.append(self.get_pattern("emoji").pattern)

    if config.include_numeric:
        pattern_parts.append(self.get_pattern("numeric").pattern)

    # Always include word pattern (this is the core tokenization)
    pattern_parts.append(self.get_pattern("word").pattern)

    if config.include_punctuation:
        pattern_parts.append(self.get_pattern("punctuation").pattern)

    # Don't add the greedy fallback - let configuration control what gets captured

    # Combine patterns with alternation (| operator)
    comprehensive_pattern = "(?:" + "|".join(pattern_parts) + ")"

    try:
        return REGEX_MODULE.compile(comprehensive_pattern, REGEX_MODULE.IGNORECASE)
    except Exception:
        # Fallback to standard re module
        if REGEX_AVAILABLE and REGEX_MODULE is not re:
            try:
                return re.compile(comprehensive_pattern, re.IGNORECASE)
            except Exception:
                # Ultimate fallback - just match words
                return re.compile(r"\S+", re.IGNORECASE)
        else:
            return re.compile(r"\S+", re.IGNORECASE)
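A usage sketch (import paths are inferred from the source locations shown above, and the default TokenizerConfig is assumed to include words, URLs, and emails):

from services.tokenizer.basic.patterns import get_patterns
from services.tokenizer.core import TokenizerConfig

patterns = get_patterns()
pattern = patterns.get_comprehensive_pattern(TokenizerConfig())

# findall() yields tokens in document order, so no segmentation/reassembly pass is needed
tokens = pattern.findall("Ping @dev_team about https://example.com #release 2024")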
get_exclusion_pattern(config)

Build pattern to identify and skip excluded entities in text.

This creates a pattern that matches URLs and emails that should be excluded, allowing the tokenizer to skip over them entirely instead of breaking them into component words.

Parameters:

config (required): TokenizerConfig specifying which token types to exclude

Returns:

Any: Compiled regex pattern that matches excluded entities, or None if no exclusions

Source code in services/tokenizer/basic/patterns.py
def get_exclusion_pattern(self, config) -> Any:
    """
    Build pattern to identify and skip excluded entities in text.

    This creates a pattern that matches URLs and emails that should be excluded,
    allowing the tokenizer to skip over them entirely instead of breaking them
    into component words.

    Args:
        config: TokenizerConfig specifying which token types to exclude

    Returns:
        Compiled regex pattern that matches excluded entities, or None if no exclusions
    """
    exclusion_parts = []

    if not config.include_urls:
        exclusion_parts.append(self.get_pattern("url").pattern)

    if not config.include_emails:
        exclusion_parts.append(self.get_pattern("email").pattern)

    if not config.include_numeric:
        exclusion_parts.append(self.get_pattern("numeric").pattern)

    if not exclusion_parts:
        return None

    # Combine exclusion patterns
    exclusion_pattern = "(?:" + "|".join(exclusion_parts) + ")"

    try:
        return REGEX_MODULE.compile(exclusion_pattern, REGEX_MODULE.IGNORECASE)
    except Exception:
        # Fallback to standard re module
        if REGEX_AVAILABLE and REGEX_MODULE is not re:
            try:
                return re.compile(exclusion_pattern, re.IGNORECASE)
            except Exception:
                return None
        else:
            return None
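A sketch of the intended use, mirroring _extract_tokens_ordered in the tokenizer module further below; it assumes TokenizerConfig accepts its flags as keyword arguments:

from services.tokenizer.basic.patterns import get_patterns
from services.tokenizer.core import TokenizerConfig

patterns = get_patterns()
config = TokenizerConfig(include_urls=False)  # assumed keyword form of the include_urls flag

exclusion = patterns.get_exclusion_pattern(config)
text = "Read https://example.com/report then reply"
if exclusion is not None:
    # blank out excluded entities so they are never split into component words
    text = " ".join(exclusion.sub(" ", text).split())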
get_pattern(pattern_name)

Get compiled pattern by name.

Parameters:

pattern_name (str, required): Name of the pattern to retrieve

Returns:

Any: Compiled regex pattern

Raises:

KeyError: If pattern name is not found

Source code in services/tokenizer/basic/patterns.py
def get_pattern(self, pattern_name: str) -> Any:
    """
    Get compiled pattern by name.

    Args:
        pattern_name: Name of the pattern to retrieve

    Returns:
        Compiled regex pattern

    Raises:
        KeyError: If pattern name is not found
    """
    if pattern_name not in self._patterns:
        raise KeyError(f"Pattern '{pattern_name}' not found")
    return self._patterns[pattern_name]
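A brief lookup sketch; the registered names come from _compile_patterns above, and the missing name in the except branch is purely illustrative:

from services.tokenizer.basic.patterns import get_patterns

patterns = get_patterns()
print(patterns.list_patterns())       # includes "url", "email", "word", ...

email_re = patterns.get_pattern("email")
match = email_re.search("contact me@example.com today")

try:
    patterns.get_pattern("sentence")  # hypothetical, unregistered name
except KeyError as exc:
    print(exc)                        # the KeyError message names the missing pattern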
list_patterns()

Get list of available pattern names.

Source code in services/tokenizer/basic/patterns.py
def list_patterns(self) -> List[str]:
    """Get list of available pattern names."""
    return list(self._patterns.keys())

get_patterns()

Get global TokenizerPatterns instance.

Returns:

TokenizerPatterns: Singleton TokenizerPatterns instance

Source code in services/tokenizer/basic/patterns.py
def get_patterns() -> TokenizerPatterns:
    """
    Get global TokenizerPatterns instance.

    Returns:
        Singleton TokenizerPatterns instance
    """
    global _global_patterns
    if _global_patterns is None:
        _global_patterns = TokenizerPatterns()
    return _global_patterns
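Because the instance is cached in the module-level _global_patterns variable, repeated calls return the same object and the regexes are compiled only once per process:

from services.tokenizer.basic.patterns import get_patterns

first = get_patterns()
second = get_patterns()
assert first is second  # same TokenizerPatterns instance reused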

tokenize_text(text, config=None)

Simple convenience function for basic text tokenization.

Source code in services/tokenizer/basic/__init__.py
def tokenize_text(text: str, config: TokenizerConfig | None = None) -> list[str]:
    """Simple convenience function for basic text tokenization."""
    tokenizer = create_basic_tokenizer(config)
    return tokenizer.tokenize(text)
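A minimal call sketch; create_basic_tokenizer is the factory referenced in the listing, and the exact tokens returned depend on the default TokenizerConfig:

from services.tokenizer.basic import tokenize_text

tokens = tokenize_text("Loving the new release! #launch @dev_team")
print(tokens)  # word, hashtag, and mention tokens in document order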

tokenizer

BasicTokenizer implementation.

This module contains the main BasicTokenizer class that implements Unicode-aware tokenization for social media text with entity preservation.

Classes:

BasicTokenizer: Unicode-aware basic tokenizer for social media text.

BasicTokenizer

Bases: AbstractTokenizer

Unicode-aware basic tokenizer for social media text.

This tokenizer handles mixed-script content, preserves social media entities (@mentions, #hashtags, URLs), and applies appropriate tokenization strategies for different script families.

Methods:

__init__: Initialize BasicTokenizer with configuration.
tokenize: Tokenize input text into a list of tokens.

Source code in services/tokenizer/basic/tokenizer.py
class BasicTokenizer(AbstractTokenizer):
    """
    Unicode-aware basic tokenizer for social media text.

    This tokenizer handles mixed-script content, preserves social media entities
    (@mentions, #hashtags, URLs), and applies appropriate tokenization strategies
    for different script families.
    """

    def __init__(self, config: Optional[TokenizerConfig] = None):
        """
        Initialize BasicTokenizer with configuration.

        Args:
            config: Tokenizer configuration. If None, default config will be used.
        """
        super().__init__(config)
        self._patterns = get_patterns()

    def tokenize(self, text: str) -> TokenList:
        """
        Tokenize input text into a list of tokens.

        Applies appropriate tokenization strategies for mixed-script content
        while preserving social media entities and handling Unicode correctly.

        Args:
            text: Input text to tokenize

        Returns:
            List of tokens extracted from the input text in document order
        """
        if not text:
            return []

        # Apply preprocessing
        processed_text = self._preprocess_text(text)
        if not processed_text:
            return []

        # Extract tokens using comprehensive regex pattern
        tokens = self._extract_tokens(processed_text)

        # Apply post-processing
        return self._postprocess_tokens(tokens)

    def _extract_tokens(self, text: str) -> TokenList:
        """
        Extract tokens using comprehensive regex patterns.
        Preserves the original order of tokens as they appear in the input text.

        Args:
            text: Preprocessed text to tokenize

        Returns:
            List of extracted tokens in their original order
        """
        return self._extract_tokens_ordered(text, LanguageFamily.MIXED)

    def _is_char_level_script(self, char: str) -> bool:
        """Check if character belongs to a script that uses character-level tokenization (scriptio continua)."""
        code_point = ord(char)
        return (
            (0x4E00 <= code_point <= 0x9FFF)  # CJK Unified Ideographs
            or (0x3400 <= code_point <= 0x4DBF)  # CJK Extension A
            or (0x3040 <= code_point <= 0x309F)  # Hiragana
            or (0x30A0 <= code_point <= 0x30FF)  # Katakana
            or (0xAC00 <= code_point <= 0xD7AF)  # Hangul Syllables
            or (0x0E00 <= code_point <= 0x0E7F)  # Thai
            or (0x0E80 <= code_point <= 0x0EFF)  # Lao
            or (0x1000 <= code_point <= 0x109F)  # Myanmar
            or (0x1780 <= code_point <= 0x17FF)  # Khmer
        )

    def _get_char_script(self, char: str) -> str:
        """
        Get the script family for a character.

        Args:
            char: Character to analyze

        Returns:
            Script family name
        """
        code_point = ord(char)

        # Latin script
        if (
            (0x0041 <= code_point <= 0x007A)
            or (0x00C0 <= code_point <= 0x024F)
            or (0x1E00 <= code_point <= 0x1EFF)
        ):
            return "latin"

        # Character-level scripts (CJK, Thai, etc.)
        elif self._is_char_level_script(char):
            return "cjk"

        # Arabic script
        elif (
            (0x0600 <= code_point <= 0x06FF)
            or (0x0750 <= code_point <= 0x077F)
            or (0x08A0 <= code_point <= 0x08FF)
        ):
            return "arabic"

        else:
            return "other"

    def _extract_tokens_ordered(
        self, text: str, language_family: LanguageFamily
    ) -> TokenList:
        """
        Extract tokens preserving their original order in the text.

        Uses a single comprehensive regex pattern to find ALL tokens in document order,
        eliminating the need for complex segmentation and reassembly logic.
        This is the Phase 2 optimization that removes O(n×segments) complexity.

        Args:
            text: Preprocessed text to tokenize
            language_family: Detected language family for the full text

        Returns:
            List of extracted tokens in their original order
        """
        if not text.strip():
            return []

        # Remove excluded entities (URLs/emails) from text if they are disabled
        # This prevents them from being tokenized into component words
        exclusion_pattern = self._patterns.get_exclusion_pattern(self._config)
        if exclusion_pattern:
            # Replace excluded entities with spaces to maintain word boundaries
            text = exclusion_pattern.sub(" ", text)
            # Clean up multiple spaces
            text = " ".join(text.split())

        if not text.strip():
            return []

        # Get comprehensive pattern based on configuration
        # This single pattern finds ALL tokens in document order
        comprehensive_pattern = self._patterns.get_comprehensive_pattern(self._config)

        # Single regex call gets all tokens in order - this is the key optimization!
        raw_tokens = comprehensive_pattern.findall(text)

        # If no tokens were found but input has content, use fallback for edge cases
        if not raw_tokens and text.strip():
            # For pure punctuation or unrecognized content, return as single token
            # This maintains compatibility with old tokenizer behavior for edge cases
            return [text.strip()]

        # Apply postprocessing for language-specific behavior and configuration filtering
        tokens = []
        for token in raw_tokens:
            if not token.strip():
                continue

            # Clean URLs by removing trailing punctuation
            if self._is_url_like(token):
                token = self._clean_url_token(token)

            # For character-level scripts, break down multi-character tokens into individual characters
            # This maintains compatibility with existing test expectations
            if (
                language_family == LanguageFamily.CJK
                and self._contains_char_level_chars(token)
            ):
                # Only break down pure character-level tokens, not mixed tokens
                if self._is_pure_char_level_token(token):
                    tokens.extend(list(token))
                else:
                    # Mixed token - keep as is but process character-level parts
                    tokens.append(token)
            elif language_family == LanguageFamily.MIXED:
                # For mixed script, break down character-level script parts but keep Latin parts whole
                processed_tokens = self._process_mixed_script_token(token)
                tokens.extend(processed_tokens)
            else:
                tokens.append(token)

        return [token for token in tokens if token.strip()]

    def _is_punctuation_only(self, token: str) -> bool:
        """Check if token contains only punctuation."""
        punctuation_chars = ".!?;:,()[]{}\"'-~`@#$%^&*+=<>/|\\"
        return all(c in punctuation_chars for c in token)

    def _is_numeric_only(self, token: str) -> bool:
        """Check if token is purely numeric."""
        return (
            token.replace(".", "")
            .replace(",", "")
            .replace("%", "")
            .replace("$", "")
            .isdigit()
        )

    def _is_url_like(self, token: str) -> bool:
        """Check if token looks like a URL."""
        # Don't classify emails as URLs
        if self._is_email_like(token):
            return False

        # Explicit URL indicators (http://, https://, www., or protocol markers)
        if token.startswith(("http://", "https://", "www.")) or "://" in token:
            return True

        # Domain-like patterns (e.g., "example.com")
        # But NOT abbreviations (e.g., "U.S.", "c.e.o.s")
        # Heuristic: tokens made entirely of dot-separated runs of 1-3 letters
        # (optionally ending in a dot) are treated as abbreviations, not URLs.
        # This allows "example.com" but excludes "U.S." and "c.e.o.s"
        if (
            token.count(".") >= 1
            and any(c.isalpha() for c in token)
            and "@" not in token
        ):
            # Check if this looks like an abbreviation (single letters between periods)
            # Pattern: letter(s).letter(s).letter(s) where segments are 1-3 chars
            abbreviation_pattern = r"^[a-z]{1,3}(?:\.[a-z]{1,3})+\.?$"

            if re.match(abbreviation_pattern, token, re.IGNORECASE):
                return False  # This is an abbreviation, not a URL
            # If it has a period and looks like a domain, it's URL-like
            return True

        return False

    def _is_email_like(self, token: str) -> bool:
        """Check if token looks like an email address."""
        return "@" in token and "." in token and not token.startswith("@")

    def _clean_url_token(self, url_token: str) -> str:
        """Remove trailing punctuation from URL tokens."""
        trailing_punctuation = ".!?;:,)]}\"'"
        return url_token.rstrip(trailing_punctuation)

    def _contains_char_level_chars(self, token: str) -> bool:
        """Check if token contains any character-level script characters."""
        return any(self._is_char_level_script(char) for char in token)

    def _is_pure_char_level_token(self, token: str) -> bool:
        """Check if token contains only character-level script characters."""
        return all(self._is_char_level_script(char) or char.isspace() for char in token)

    def _process_mixed_script_token(self, token: str) -> TokenList:
        """Process mixed script tokens by breaking down character-level script parts."""
        if not self._contains_char_level_chars(token):
            return [token]

        result = []
        current_token = ""
        current_is_cjk = None

        for char in token:
            char_is_cjk = self._is_char_level_script(char)

            if current_is_cjk is None:
                current_is_cjk = char_is_cjk
                current_token = char
            elif char_is_cjk == current_is_cjk:
                current_token += char
            else:
                # Script change
                if current_token.strip():
                    if current_is_cjk and len(current_token) > 1:
                        # Break CJK into individual characters
                        result.extend(list(current_token))
                    else:
                        result.append(current_token)
                current_token = char
                current_is_cjk = char_is_cjk

        # Handle final token
        if current_token.strip():
            if current_is_cjk and len(current_token) > 1:
                result.extend(list(current_token))
            else:
                result.append(current_token)

        return result

    def _postprocess_tokens(self, tokens: TokenList) -> TokenList:
        """
        Apply post-processing to extracted tokens.

        Args:
            tokens: List of raw tokens

        Returns:
            Processed token list
        """
        if not tokens:
            return tokens

        # Apply base class post-processing (length filtering, whitespace stripping, etc.)
        return super()._postprocess_tokens(tokens)
__init__(config=None)

Initialize BasicTokenizer with configuration.

Parameters:

config (Optional[TokenizerConfig], default None): Tokenizer configuration. If None, default config will be used.

Source code in services/tokenizer/basic/tokenizer.py
def __init__(self, config: Optional[TokenizerConfig] = None):
    """
    Initialize BasicTokenizer with configuration.

    Args:
        config: Tokenizer configuration. If None, default config will be used.
    """
    super().__init__(config)
    self._patterns = get_patterns()
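A hedged construction sketch, assuming TokenizerConfig accepts the flags used throughout this module as keyword arguments:

from services.tokenizer.basic.tokenizer import BasicTokenizer
from services.tokenizer.core import TokenizerConfig

default_tokenizer = BasicTokenizer()  # falls back to TokenizerConfig()

# URLs and emails are skipped entirely instead of being split into fragments
# (keyword arguments are assumed to match the config field names)
strict = BasicTokenizer(TokenizerConfig(include_urls=False, include_emails=False))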
tokenize(text)

Tokenize input text into a list of tokens.

Applies appropriate tokenization strategies for mixed-script content while preserving social media entities and handling Unicode correctly.

Parameters:

text (str, required): Input text to tokenize

Returns:

TokenList: List of tokens extracted from the input text in document order

Source code in services/tokenizer/basic/tokenizer.py
def tokenize(self, text: str) -> TokenList:
    """
    Tokenize input text into a list of tokens.

    Applies appropriate tokenization strategies for mixed-script content
    while preserving social media entities and handling Unicode correctly.

    Args:
        text: Input text to tokenize

    Returns:
        List of tokens extracted from the input text in document order
    """
    if not text:
        return []

    # Apply preprocessing
    processed_text = self._preprocess_text(text)
    if not processed_text:
        return []

    # Extract tokens using comprehensive regex pattern
    tokens = self._extract_tokens(processed_text)

    # Apply post-processing
    return self._postprocess_tokens(tokens)
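An end-to-end sketch exercising the mixed-script handling described above; the exact output depends on the compiled patterns and configuration, so the comment is indicative only:

from services.tokenizer.basic.tokenizer import BasicTokenizer

tokenizer = BasicTokenizer()
tokens = tokenizer.tokenize("新発売! Try it now @acme #launch https://example.com")
# Indicatively: CJK characters come out one per token, Latin words stay whole,
# and @acme, #launch, and the URL are preserved as single tokens
print(tokens)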