Tokenizer

services.tokenizer.core

Core tokenizer components.

Exports:

- AbstractTokenizer
- TokenizerConfig
- TokenList
- LanguageFamily
- TokenType
- CaseHandling

Intended usage

from services.tokenizer.core import (
    AbstractTokenizer,
    TokenizerConfig,
    TokenList,
    LanguageFamily,
    TokenType,
    CaseHandling,
)
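
A short usage sketch follows. `run` is a hypothetical helper shown only to illustrate how the exported names fit together, and the configuration values are illustrative.

def run(tokenizer: AbstractTokenizer, text: str) -> TokenList:
    # Any concrete AbstractTokenizer subclass can be swapped in here.
    return tokenizer.tokenize(text)

# TokenizerConfig fields are documented below; these values are examples only.
config = TokenizerConfig(
    case_handling=CaseHandling.PRESERVE,
    include_emoji=True,
    fallback_language_family=LanguageFamily.MIXED,
)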

Modules:

Name   Description
base   AbstractTokenizer abstract base class
types  TokenizerConfig, enums, and shared types

Classes:

Name               Description
AbstractTokenizer  Abstract base class for all tokenizer implementations.
CaseHandling       How to handle character case during tokenization.
LanguageFamily     Language families that affect tokenization strategies.
TokenType          Types of tokens that can be extracted.
TokenizerConfig    Configuration for tokenizer behavior.

AbstractTokenizer

Bases: ABC

Abstract base class for all tokenizer implementations.

This class defines the core interface that all tokenizer plugins must implement. It provides a clean contract for tokenization operations while allowing for different implementation strategies.
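
To make the contract concrete, here is a minimal sketch of a subclass; WhitespaceTokenizer is a hypothetical example, not part of the library.

class WhitespaceTokenizer(AbstractTokenizer):
    """Hypothetical example: the simplest possible concrete tokenizer."""

    def tokenize(self, text: str) -> TokenList:
        # Reuse base-class preprocessing (case handling, Unicode normalization) ...
        processed = self._preprocess_text(text)
        if not processed:
            return []
        # ... split naively on whitespace, then apply base-class filtering
        # (whitespace stripping, emoji filtering, length limits).
        return self._postprocess_tokens(processed.split())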

Methods:

Name      Description
__init__  Initialize the tokenizer with configuration.
tokenize  Tokenize input text into a list of tokens.

Attributes:

Name    Type             Description
config  TokenizerConfig  Get the current tokenizer configuration.

Source code in services/tokenizer/core/base.py
class AbstractTokenizer(ABC):
    """
    Abstract base class for all tokenizer implementations.

    This class defines the core interface that all tokenizer plugins must implement.
    It provides a clean contract for tokenization operations while allowing for
    different implementation strategies.
    """

    def __init__(self, config: Optional[TokenizerConfig] = None):
        """
        Initialize the tokenizer with configuration.

        Args:
            config: Tokenizer configuration. If None, default config will be used.
        """
        self._config = config or TokenizerConfig()

    @property
    def config(self) -> TokenizerConfig:
        """Get the current tokenizer configuration."""
        return self._config

    @abstractmethod
    def tokenize(self, text: str) -> TokenList:
        """
        Tokenize input text into a list of tokens.

        This is the main tokenization method that all implementations must provide.

        Args:
            text: Input text to tokenize

        Returns:
            List of tokens extracted from the input text
        """
        pass

    def _preprocess_text(self, text: str) -> str:
        """
        Apply preprocessing to text before tokenization.

        This method applies configuration-based preprocessing such as
        case handling and Unicode normalization.

        Args:
            text: Input text to preprocess

        Returns:
            Preprocessed text
        """
        if not text:
            return text

        # Apply Unicode normalization
        if self._config.normalize_unicode:
            import unicodedata

            text = unicodedata.normalize("NFKC", text)

        # Apply case handling
        from .types import CaseHandling

        if self._config.case_handling == CaseHandling.LOWERCASE:
            text = text.lower()
        elif self._config.case_handling == CaseHandling.UPPERCASE:
            text = text.upper()
        elif self._config.case_handling == CaseHandling.NORMALIZE:
            # TODO: Implement proper noun detection for smart normalization
            # Currently using simple lowercase as a placeholder
            text = text.lower()

        return text

    def _postprocess_tokens(self, tokens: TokenList) -> TokenList:
        """
        Apply post-processing to extracted tokens.

        This method applies configuration-based filtering and cleanup
        to the token list.

        Args:
            tokens: List of raw tokens

        Returns:
            Processed token list
        """
        if not tokens:
            return tokens

        processed_tokens = []

        for token in tokens:
            # Strip whitespace if configured
            if self._config.strip_whitespace:
                token = token.strip()

            # Skip empty tokens
            if not token:
                continue

            # Filter emojis if not included
            if not self._config.include_emoji and self._is_emoji(token):
                continue

            # Apply length filtering
            if len(token) < self._config.min_token_length:
                continue

            if (
                self._config.max_token_length is not None
                and len(token) > self._config.max_token_length
            ):
                continue

            processed_tokens.append(token)

        return processed_tokens

    @staticmethod
    def _is_emoji(token: str) -> bool:
        """
        Check if a token is an emoji character.

        Args:
            token: Token to check

        Returns:
            True if the token is an emoji, False otherwise
        """
        if not token:
            return False

        # Accept sequences made of emoji code points plus common modifiers
        EMOJI_RANGES = (
            (0x1F600, 0x1F64F),  # Emoticons
            (0x1F300, 0x1F5FF),  # Misc Symbols & Pictographs
            (0x1F680, 0x1F6FF),  # Transport & Map
            (0x1F1E6, 0x1F1FF),  # Regional Indicators
            (0x2600, 0x26FF),  # Misc symbols
            (0x2700, 0x27BF),  # Dingbats
            (0x1F900, 0x1F9FF),  # Supplemental Symbols & Pictographs
            (0x1FA70, 0x1FAFF),  # Symbols & Pictographs Extended-A
        )
        MODIFIERS = {0x200D, 0xFE0E, 0xFE0F}  # ZWJ, VS15, VS16
        SKIN_TONE = (0x1F3FB, 0x1F3FF)
        TAGS = (0xE0020, 0xE007F)  # Emoji tag sequences

        def in_any_range(cp: int, ranges) -> bool:
            for a, b in ranges:
                if a <= cp <= b:
                    return True
            return False

        def is_modifier(cp: int) -> bool:
            return (
                cp in MODIFIERS
                or SKIN_TONE[0] <= cp <= SKIN_TONE[1]
                or TAGS[0] <= cp <= TAGS[1]
            )

        for ch in token:
            cp = ord(ch)
            if not (in_any_range(cp, EMOJI_RANGES) or is_modifier(cp)):
                return False
        return True

__init__(config=None)

Initialize the tokenizer with configuration.

Parameters:

Name    Type                       Description                                                      Default
config  Optional[TokenizerConfig]  Tokenizer configuration. If None, default config will be used.  None

Source code in services/tokenizer/core/base.py
def __init__(self, config: Optional[TokenizerConfig] = None):
    """
    Initialize the tokenizer with configuration.

    Args:
        config: Tokenizer configuration. If None, default config will be used.
    """
    self._config = config or TokenizerConfig()

config property

Get the current tokenizer configuration.

tokenize(text) abstractmethod

Tokenize input text into a list of tokens.

This is the main tokenization method that all implementations must provide.

Parameters:

Name  Type  Description             Default
text  str   Input text to tokenize  required

Returns:

Type       Description
TokenList  List of tokens extracted from the input text

Source code in services/tokenizer/core/base.py
@abstractmethod
def tokenize(self, text: str) -> TokenList:
    """
    Tokenize input text into a list of tokens.

    This is the main tokenization method that all implementations must provide.

    Args:
        text: Input text to tokenize

    Returns:
        List of tokens extracted from the input text
    """
    pass

CaseHandling

Bases: Enum

How to handle character case during tokenization.

Source code in services/tokenizer/core/types.py
class CaseHandling(Enum):
    """How to handle character case during tokenization."""

    PRESERVE = "preserve"  # Keep original case
    LOWERCASE = "lowercase"  # Convert to lowercase
    UPPERCASE = "uppercase"  # Convert to uppercase
    NORMALIZE = "normalize"  # Smart case normalization

LanguageFamily

Bases: str, Enum

Language families that affect tokenization strategies.

Source code in services/tokenizer/core/types.py
class LanguageFamily(str, Enum):
    """Language families that affect tokenization strategies."""

    LATIN = "latin"  # Space-separated languages (English, French, etc.)
    CJK = "cjk"  # Chinese, Japanese, Korean
    ARABIC = "arabic"  # Arabic script languages
    MIXED = "mixed"  # Mixed content requiring multiple strategies
    UNKNOWN = "unknown"  # Language detection failed or not performed

TokenType

Bases: str, Enum

Types of tokens that can be extracted.

Source code in services/tokenizer/core/types.py
class TokenType(str, Enum):
    """Types of tokens that can be extracted."""

    WORD = "word"  # Regular words
    PUNCTUATION = "punctuation"  # Punctuation marks
    NUMERIC = "numeric"  # Numbers
    EMOJI = "emoji"  # Emoji characters
    HASHTAG = "hashtag"  # Social media hashtags
    MENTION = "mention"  # Social media mentions
    URL = "url"  # URLs and links
    EMAIL = "email"  # Email addresses
    WHITESPACE = "whitespace"  # Whitespace (when preserved)

TokenizerConfig pydantic-model

Bases: BaseModel

Configuration for tokenizer behavior.

Controls all aspects of text tokenization including script handling, social media entity processing, and output formatting.

Social Media Entity Behavior:
- extract_hashtags/extract_mentions: When False, splits into component words
- include_urls/include_emails: When False, completely excludes (no fragmentation)

Fields:

Source code in services/tokenizer/core/types.py
class TokenizerConfig(BaseModel):
    """Configuration for tokenizer behavior.

    Controls all aspects of text tokenization including script handling,
    social media entity processing, and output formatting.

    Social Media Entity Behavior:
    - extract_hashtags/extract_mentions: When False, splits into component words
    - include_urls/include_emails: When False, completely excludes (no fragmentation)
    """

    # Language detection settings
    fallback_language_family: LanguageFamily = LanguageFamily.MIXED
    """Default language family when detection fails or mixed content is found."""

    # Token type filtering
    include_punctuation: bool = False
    """Whether to include punctuation marks as separate tokens."""

    include_numeric: bool = True
    """Whether to include numeric tokens (integers, decimals, etc.)."""

    include_emoji: bool = False
    """Whether to include emoji characters as tokens."""

    # Text preprocessing
    case_handling: CaseHandling = CaseHandling.LOWERCASE
    """How to handle character case during tokenization."""

    normalize_unicode: bool = True
    """Whether to apply Unicode NFKC normalization for consistent character representation."""

    # Social media features
    extract_hashtags: bool = True
    """Whether to preserve hashtags as single tokens. If False, splits into component words."""

    extract_mentions: bool = True
    """Whether to preserve @mentions as single tokens. If False, splits into component words."""

    include_urls: bool = True
    """Whether to include URLs as tokens. If False, URLs are completely excluded (not fragmented)."""

    include_emails: bool = True
    """Whether to include email addresses as tokens. If False, emails are completely excluded (not fragmented)."""

    # Output formatting
    min_token_length: int = 1
    """Minimum length for tokens to be included in output."""

    max_token_length: Optional[int] = None
    """Maximum length for tokens. If None, no length limit is applied."""

    strip_whitespace: bool = True
    """Whether to strip leading/trailing whitespace from tokens."""

case_handling = CaseHandling.LOWERCASE pydantic-field

How to handle character case during tokenization.

extract_hashtags = True pydantic-field

Whether to preserve hashtags as single tokens. If False, splits into component words.

extract_mentions = True pydantic-field

Whether to preserve @mentions as single tokens. If False, splits into component words.

fallback_language_family = LanguageFamily.MIXED pydantic-field

Default language family when detection fails or mixed content is found.

include_emails = True pydantic-field

Whether to include email addresses as tokens. If False, emails are completely excluded (not fragmented).

include_emoji = False pydantic-field

Whether to include emoji characters as tokens.

include_numeric = True pydantic-field

Whether to include numeric tokens (integers, decimals, etc.).

include_punctuation = False pydantic-field

Whether to include punctuation marks as separate tokens.

include_urls = True pydantic-field

Whether to include URLs as tokens. If False, URLs are completely excluded (not fragmented).

max_token_length = None pydantic-field

Maximum length for tokens. If None, no length limit is applied.

min_token_length = 1 pydantic-field

Minimum length for tokens to be included in output.

normalize_unicode = True pydantic-field

Whether to apply Unicode NFKC normalization for consistent character representation.

strip_whitespace = True pydantic-field

Whether to strip leading/trailing whitespace from tokens.
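
A brief configuration sketch based on the behavior described above; the chosen values are illustrative only.

# Keep hashtags and mentions as single tokens, drop URLs outright (not fragmented),
# and ignore single-character tokens.
config = TokenizerConfig(
    extract_hashtags=True,
    extract_mentions=True,
    include_urls=False,
    min_token_length=2,
)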

base

AbstractTokenizer abstract base class

This module contains the abstract base class that defines the interface for all tokenizer implementations.

Classes:

Name               Description
AbstractTokenizer  Abstract base class for all tokenizer implementations (documented in full above).

types

TokenizerConfig, enums, and shared types

This module contains configuration models, enumerations, and shared type definitions used across the tokenizer service.

Classes:

Name             Description
CaseHandling     How to handle character case during tokenization.
LanguageFamily   Language families that affect tokenization strategies.
TokenType        Types of tokens that can be extracted.
TokenizerConfig  Configuration for tokenizer behavior.

All four classes are documented in full above under services.tokenizer.core.

services.tokenizer.basic

Basic tokenizer implementation.

This module exports the BasicTokenizer implementation that provides fundamental Unicode-aware tokenization capabilities for social media text.
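
For reference, a sketch of the corresponding imports, using the names listed below:

from services.tokenizer.basic import (
    BasicTokenizer,
    TokenizerConfig,
    create_basic_tokenizer,
    get_patterns,
    tokenize_text,
)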

Modules:

Name       Description
patterns   Regex patterns for text tokenization.
tokenizer  BasicTokenizer implementation.

Classes:

Name             Description
BasicTokenizer   Unicode-aware basic tokenizer for social media text.
TokenizerConfig  Configuration for tokenizer behavior (documented above under services.tokenizer.core).

Functions:

Name                    Description
create_basic_tokenizer  Create a BasicTokenizer with optional configuration.
get_patterns            Get global TokenizerPatterns instance.
tokenize_text           Simple convenience function for basic text tokenization.

BasicTokenizer

Bases: AbstractTokenizer

Unicode-aware basic tokenizer for social media text.

This tokenizer handles mixed-script content, preserves social media entities (@mentions, #hashtags, URLs), and applies appropriate tokenization strategies for different script families.
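
A minimal usage sketch; the input string and the output shown in the comment are illustrative, and the real result depends on the active TokenizerConfig.

tokenizer = BasicTokenizer()  # default config: lowercase, hashtags/mentions/URLs preserved
tokens = tokenizer.tokenize("Check out #Python tips from @alice at https://example.com!")
# With the defaults described in TokenizerConfig, the social media entities stay intact,
# roughly: ['check', 'out', '#python', 'tips', 'from', '@alice', 'at', 'https://example.com']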

Methods:

Name      Description
__init__  Initialize BasicTokenizer with configuration.
tokenize  Tokenize input text into a list of tokens.

Source code in services/tokenizer/basic/tokenizer.py
class BasicTokenizer(AbstractTokenizer):
    """
    Unicode-aware basic tokenizer for social media text.

    This tokenizer handles mixed-script content, preserves social media entities
    (@mentions, #hashtags, URLs), and applies appropriate tokenization strategies
    for different script families.
    """

    def __init__(self, config: Optional[TokenizerConfig] = None):
        """
        Initialize BasicTokenizer with configuration.

        Args:
            config: Tokenizer configuration. If None, default config will be used.
        """
        super().__init__(config)
        self._patterns = get_patterns()

    def tokenize(self, text: str) -> TokenList:
        """
        Tokenize input text into a list of tokens.

        Applies appropriate tokenization strategies for mixed-script content
        while preserving social media entities and handling Unicode correctly.

        Args:
            text: Input text to tokenize

        Returns:
            List of tokens extracted from the input text in document order
        """
        if not text:
            return []

        # Apply preprocessing
        processed_text = self._preprocess_text(text)
        if not processed_text:
            return []

        # Extract tokens using comprehensive regex pattern
        tokens = self._extract_tokens(processed_text)

        # Apply post-processing
        return self._postprocess_tokens(tokens)

    def _extract_tokens(self, text: str) -> TokenList:
        """
        Extract tokens using comprehensive regex patterns.
        Preserves the original order of tokens as they appear in the input text.

        Args:
            text: Preprocessed text to tokenize

        Returns:
            List of extracted tokens in their original order
        """
        return self._extract_tokens_ordered(text, LanguageFamily.MIXED)

    def _is_char_level_script(self, char: str) -> bool:
        """Check if character belongs to a script that uses character-level tokenization (scriptio continua)."""
        code_point = ord(char)
        return (
            (0x4E00 <= code_point <= 0x9FFF)  # CJK Unified Ideographs
            or (0x3400 <= code_point <= 0x4DBF)  # CJK Extension A
            or (0x3040 <= code_point <= 0x309F)  # Hiragana
            or (0x30A0 <= code_point <= 0x30FF)  # Katakana
            or (0xAC00 <= code_point <= 0xD7AF)  # Hangul Syllables
            or (0x0E00 <= code_point <= 0x0E7F)  # Thai
            or (0x0E80 <= code_point <= 0x0EFF)  # Lao
            or (0x1000 <= code_point <= 0x109F)  # Myanmar
            or (0x1780 <= code_point <= 0x17FF)  # Khmer
        )

    def _get_char_script(self, char: str) -> str:
        """
        Get the script family for a character.

        Args:
            char: Character to analyze

        Returns:
            Script family name
        """
        code_point = ord(char)

        # Latin script
        if (
            (0x0041 <= code_point <= 0x007A)
            or (0x00C0 <= code_point <= 0x024F)
            or (0x1E00 <= code_point <= 0x1EFF)
        ):
            return "latin"

        # Character-level scripts (CJK, Thai, etc.)
        elif self._is_char_level_script(char):
            return "cjk"

        # Arabic script
        elif (
            (0x0600 <= code_point <= 0x06FF)
            or (0x0750 <= code_point <= 0x077F)
            or (0x08A0 <= code_point <= 0x08FF)
        ):
            return "arabic"

        else:
            return "other"

    def _extract_tokens_ordered(
        self, text: str, language_family: LanguageFamily
    ) -> TokenList:
        """
        Extract tokens preserving their original order in the text.

        Uses a single comprehensive regex pattern to find ALL tokens in document order,
        eliminating the need for complex segmentation and reassembly logic.
        This is the Phase 2 optimization that removes O(n×segments) complexity.

        Args:
            text: Preprocessed text to tokenize
            language_family: Detected language family for the full text

        Returns:
            List of extracted tokens in their original order
        """
        if not text.strip():
            return []

        # Remove excluded entities (URLs/emails) from text if they are disabled
        # This prevents them from being tokenized into component words
        exclusion_pattern = self._patterns.get_exclusion_pattern(self._config)
        if exclusion_pattern:
            # Replace excluded entities with spaces to maintain word boundaries
            text = exclusion_pattern.sub(" ", text)
            # Clean up multiple spaces
            text = " ".join(text.split())

        if not text.strip():
            return []

        # Get comprehensive pattern based on configuration
        # This single pattern finds ALL tokens in document order
        comprehensive_pattern = self._patterns.get_comprehensive_pattern(self._config)

        # Single regex call gets all tokens in order - this is the key optimization!
        raw_tokens = comprehensive_pattern.findall(text)

        # If no tokens were found but input has content, use fallback for edge cases
        if not raw_tokens and text.strip():
            # For pure punctuation or unrecognized content, return as single token
            # This maintains compatibility with old tokenizer behavior for edge cases
            return [text.strip()]

        # Apply postprocessing for language-specific behavior and configuration filtering
        tokens = []
        for token in raw_tokens:
            if not token.strip():
                continue

            # Clean URLs by removing trailing punctuation
            if self._is_url_like(token):
                token = self._clean_url_token(token)

            # For character-level scripts, break down multi-character tokens into individual characters
            # This maintains compatibility with existing test expectations
            if (
                language_family == LanguageFamily.CJK
                and self._contains_char_level_chars(token)
            ):
                # Only break down pure character-level tokens, not mixed tokens
                if self._is_pure_char_level_token(token):
                    tokens.extend(list(token))
                else:
                    # Mixed token - keep as is but process character-level parts
                    tokens.append(token)
            elif language_family == LanguageFamily.MIXED:
                # For mixed script, break down character-level script parts but keep Latin parts whole
                processed_tokens = self._process_mixed_script_token(token)
                tokens.extend(processed_tokens)
            else:
                tokens.append(token)

        return [token for token in tokens if token.strip()]

    def _is_punctuation_only(self, token: str) -> bool:
        """Check if token contains only punctuation."""
        punctuation_chars = ".!?;:,()[]{}\"'-~`@#$%^&*+=<>/|\\"
        return all(c in punctuation_chars for c in token)

    def _is_numeric_only(self, token: str) -> bool:
        """Check if token is purely numeric."""
        return (
            token.replace(".", "")
            .replace(",", "")
            .replace("%", "")
            .replace("$", "")
            .isdigit()
        )

    def _is_url_like(self, token: str) -> bool:
        """Check if token looks like a URL."""
        # Don't classify emails as URLs
        if self._is_email_like(token):
            return False

        # Explicit URL indicators (http://, https://, www., or protocol markers)
        if token.startswith(("http://", "https://", "www.")) or "://" in token:
            return True

        # Domain-like patterns (e.g., "example.com")
        # But NOT abbreviations (e.g., "U.S.", "c.e.o.s")
        # Heuristic: URLs have at least one period NOT followed by a single uppercase/lowercase letter
        # This allows "example.com" but excludes "U.S." and "c.e.o.s"
        if (
            token.count(".") >= 1
            and any(c.isalpha() for c in token)
            and "@" not in token
        ):
            # Check if this looks like an abbreviation (single letters between periods)
            # Pattern: letter(s).letter(s).letter(s) where segments are 1-3 chars
            abbreviation_pattern = r"^[a-z]{1,3}(?:\.[a-z]{1,3})+\.?$"

            if re.match(abbreviation_pattern, token, re.IGNORECASE):
                return False  # This is an abbreviation, not a URL
            # If it has a period and looks like a domain, it's URL-like
            return True

        return False

    def _is_email_like(self, token: str) -> bool:
        """Check if token looks like an email address."""
        return "@" in token and "." in token and not token.startswith("@")

    def _clean_url_token(self, url_token: str) -> str:
        """Remove trailing punctuation from URL tokens."""
        trailing_punctuation = ".!?;:,)]}\"'"
        return url_token.rstrip(trailing_punctuation)

    def _contains_char_level_chars(self, token: str) -> bool:
        """Check if token contains any character-level script characters."""
        return any(self._is_char_level_script(char) for char in token)

    def _is_pure_char_level_token(self, token: str) -> bool:
        """Check if token contains only character-level script characters."""
        return all(self._is_char_level_script(char) or char.isspace() for char in token)

    def _process_mixed_script_token(self, token: str) -> TokenList:
        """Process mixed script tokens by breaking down character-level script parts."""
        if not self._contains_char_level_chars(token):
            return [token]

        result = []
        current_token = ""
        current_is_cjk = None

        for char in token:
            char_is_cjk = self._is_char_level_script(char)

            if current_is_cjk is None:
                current_is_cjk = char_is_cjk
                current_token = char
            elif char_is_cjk == current_is_cjk:
                current_token += char
            else:
                # Script change
                if current_token.strip():
                    if current_is_cjk and len(current_token) > 1:
                        # Break CJK into individual characters
                        result.extend(list(current_token))
                    else:
                        result.append(current_token)
                current_token = char
                current_is_cjk = char_is_cjk

        # Handle final token
        if current_token.strip():
            if current_is_cjk and len(current_token) > 1:
                result.extend(list(current_token))
            else:
                result.append(current_token)

        return result

    def _postprocess_tokens(self, tokens: TokenList) -> TokenList:
        """
        Apply post-processing to extracted tokens.

        Args:
            tokens: List of raw tokens

        Returns:
            Processed token list
        """
        if not tokens:
            return tokens

        # Apply base class post-processing (length filtering, whitespace stripping, etc.)
        return super()._postprocess_tokens(tokens)

__init__(config=None)

Initialize BasicTokenizer with configuration.

Parameters:

Name    Type                       Description                                                      Default
config  Optional[TokenizerConfig]  Tokenizer configuration. If None, default config will be used.  None

Source code in services/tokenizer/basic/tokenizer.py
def __init__(self, config: Optional[TokenizerConfig] = None):
    """
    Initialize BasicTokenizer with configuration.

    Args:
        config: Tokenizer configuration. If None, default config will be used.
    """
    super().__init__(config)
    self._patterns = get_patterns()

tokenize(text)

Tokenize input text into a list of tokens.

Applies appropriate tokenization strategies for mixed-script content while preserving social media entities and handling Unicode correctly.

Parameters:

Name  Type  Description             Default
text  str   Input text to tokenize  required

Returns:

Type       Description
TokenList  List of tokens extracted from the input text in document order

Source code in services/tokenizer/basic/tokenizer.py
def tokenize(self, text: str) -> TokenList:
    """
    Tokenize input text into a list of tokens.

    Applies appropriate tokenization strategies for mixed-script content
    while preserving social media entities and handling Unicode correctly.

    Args:
        text: Input text to tokenize

    Returns:
        List of tokens extracted from the input text in document order
    """
    if not text:
        return []

    # Apply preprocessing
    processed_text = self._preprocess_text(text)
    if not processed_text:
        return []

    # Extract tokens using comprehensive regex pattern
    tokens = self._extract_tokens(processed_text)

    # Apply post-processing
    return self._postprocess_tokens(tokens)

create_basic_tokenizer(config=None)

Create a BasicTokenizer with optional configuration.

Source code in services/tokenizer/basic/__init__.py
def create_basic_tokenizer(config: TokenizerConfig | None = None) -> BasicTokenizer:
    """Create a BasicTokenizer with optional configuration."""
    if config is None:
        config = TokenizerConfig()
    return BasicTokenizer(config)
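
For example (a sketch; the configuration value is illustrative):

# The factory is equivalent to constructing BasicTokenizer directly.
tok = create_basic_tokenizer(TokenizerConfig(include_punctuation=True))
same = BasicTokenizer(TokenizerConfig(include_punctuation=True))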

get_patterns()

Get global TokenizerPatterns instance.

Returns:

Type               Description
TokenizerPatterns  Singleton TokenizerPatterns instance

Source code in services/tokenizer/basic/patterns.py
def get_patterns() -> TokenizerPatterns:
    """
    Get global TokenizerPatterns instance.

    Returns:
        Singleton TokenizerPatterns instance
    """
    global _global_patterns
    if _global_patterns is None:
        _global_patterns = TokenizerPatterns()
    return _global_patterns

patterns

Regex patterns for text tokenization.

This module contains compiled regular expressions for extracting different types of tokens from social media text, falling back from the third-party regex module to the standard library re module when pattern compilation fails.

Classes:

Name               Description
TokenizerPatterns  Compiled regex patterns for tokenization.

Functions:

Name          Description
get_patterns  Get global TokenizerPatterns instance.

TokenizerPatterns

Compiled regex patterns for tokenization.

Organizes patterns logically and provides efficient compiled regex objects for different token types found in social media text.
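
A short sketch of how the registry is typically consumed, using the methods and pattern names documented below; the sample text is illustrative.

patterns = get_patterns()                      # module-level singleton
url_re = patterns.get_pattern("url")           # one compiled pattern by name
combined = patterns.get_comprehensive_pattern(TokenizerConfig())
matches = combined.findall("see https://example.com #docs")
print(patterns.list_patterns())                # all registered pattern names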

Methods:

Name                       Description
__init__                   Initialize and compile all tokenization patterns.
get_comprehensive_pattern  Build comprehensive tokenization pattern based on configuration.
get_exclusion_pattern      Build pattern to identify and skip excluded entities in text.
get_pattern                Get compiled pattern by name.
list_patterns              Get list of available pattern names.

Source code in services/tokenizer/basic/patterns.py
class TokenizerPatterns:
    """
    Compiled regex patterns for tokenization.

    Organizes patterns logically and provides efficient compiled regex objects
    for different token types found in social media text.
    """

    def __init__(self):
        """Initialize and compile all tokenization patterns."""
        self._patterns: Dict[str, Any] = {}
        self._compile_patterns()

    def get_pattern(self, pattern_name: str) -> Any:
        """
        Get compiled pattern by name.

        Args:
            pattern_name: Name of the pattern to retrieve

        Returns:
            Compiled regex pattern

        Raises:
            KeyError: If pattern name is not found
        """
        if pattern_name not in self._patterns:
            raise KeyError(f"Pattern '{pattern_name}' not found")
        return self._patterns[pattern_name]

    def get_comprehensive_pattern(self, config) -> Any:
        """
        Build comprehensive tokenization pattern based on configuration.

        This creates a single regex pattern that finds ALL tokens in document order,
        eliminating the need for segmentation and reassembly. URLs and emails are
        conditionally included in the regex itself based on configuration, avoiding
        the need for post-processing filtering.

        Args:
            config: TokenizerConfig specifying which token types to include

        Returns:
            Compiled regex pattern that matches all desired token types in priority order
        """
        pattern_parts = []

        # Conditionally add URL and email patterns based on configuration
        # This eliminates the need for post-processing filtering
        if config.include_urls:
            pattern_parts.append(self.get_pattern("url").pattern)

        if config.include_emails:
            pattern_parts.append(self.get_pattern("email").pattern)

        if config.extract_mentions:
            pattern_parts.append(self.get_pattern("mention").pattern)

        if config.extract_hashtags:
            pattern_parts.append(self.get_pattern("hashtag").pattern)

        if config.include_emoji:
            pattern_parts.append(self.get_pattern("emoji").pattern)

        if config.include_numeric:
            pattern_parts.append(self.get_pattern("numeric").pattern)

        # Always include word pattern (this is the core tokenization)
        pattern_parts.append(self.get_pattern("word").pattern)

        if config.include_punctuation:
            pattern_parts.append(self.get_pattern("punctuation").pattern)

        # Don't add the greedy fallback - let configuration control what gets captured

        # Combine patterns with alternation (| operator)
        comprehensive_pattern = "(?:" + "|".join(pattern_parts) + ")"

        try:
            return REGEX_MODULE.compile(comprehensive_pattern, REGEX_MODULE.IGNORECASE)
        except Exception:
            # Fallback to standard re module
            if REGEX_AVAILABLE and REGEX_MODULE is not re:
                try:
                    return re.compile(comprehensive_pattern, re.IGNORECASE)
                except Exception:
                    # Ultimate fallback - just match words
                    return re.compile(r"\S+", re.IGNORECASE)
            else:
                return re.compile(r"\S+", re.IGNORECASE)

    def get_exclusion_pattern(self, config) -> Any:
        """
        Build pattern to identify and skip excluded entities in text.

        This creates a pattern that matches URLs and emails that should be excluded,
        allowing the tokenizer to skip over them entirely instead of breaking them
        into component words.

        Args:
            config: TokenizerConfig specifying which token types to exclude

        Returns:
            Compiled regex pattern that matches excluded entities, or None if no exclusions
        """
        exclusion_parts = []

        if not config.include_urls:
            exclusion_parts.append(self.get_pattern("url").pattern)

        if not config.include_emails:
            exclusion_parts.append(self.get_pattern("email").pattern)

        if not config.include_numeric:
            exclusion_parts.append(self.get_pattern("numeric").pattern)

        if not exclusion_parts:
            return None

        # Combine exclusion patterns
        exclusion_pattern = "(?:" + "|".join(exclusion_parts) + ")"

        try:
            return REGEX_MODULE.compile(exclusion_pattern, REGEX_MODULE.IGNORECASE)
        except Exception:
            # Fallback to standard re module
            if REGEX_AVAILABLE and REGEX_MODULE is not re:
                try:
                    return re.compile(exclusion_pattern, re.IGNORECASE)
                except Exception:
                    return None
            else:
                return None

    def list_patterns(self) -> List[str]:
        """Get list of available pattern names."""
        return list(self._patterns.keys())

    def _compile_patterns(self):
        """Compile all regex patterns with fallback support."""

        # Compile patterns with fallback handling
        patterns_to_compile = {
            "url": URL_PATTERN,
            "email": EMAIL_PATTERN,
            "mention": MENTION_PATTERN,
            "hashtag": HASHTAG_PATTERN,
            "emoji": EMOJI_PATTERN,
            "numeric": NUMERIC_PATTERN,
            "word": WORD_PATTERN,
            "latin_word": LATIN_WORD_PATTERN,
            "cjk_chars": CJK_PATTERN,
            "arabic_chars": ARABIC_PATTERN,
            "punctuation": PUNCTUATION_PATTERN,
            "social_media": SOCIAL_MEDIA_PATTERN,
            "word_boundary": WORD_BOUNDARY_PATTERN,
            "combined_social_entities": COMBINED_SOCIAL_ENTITIES_PATTERN,
        }

        for name, pattern in patterns_to_compile.items():
            try:
                self._patterns[name] = REGEX_MODULE.compile(
                    pattern, REGEX_MODULE.IGNORECASE
                )
            except Exception:
                # If compilation fails with regex module, fall back to re
                if REGEX_AVAILABLE and REGEX_MODULE is not re:
                    try:
                        self._patterns[name] = re.compile(pattern, re.IGNORECASE)
                    except Exception:
                        # If both fail, create a simple fallback
                        self._patterns[name] = re.compile(r"\S+", re.IGNORECASE)
                else:
                    # Already using re module, create simple fallback
                    self._patterns[name] = re.compile(r"\S+", re.IGNORECASE)
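The fallback branches above assume a module-level choice between the third-party regex package and the standard re module. A minimal sketch of that import idiom, using the REGEX_MODULE and REGEX_AVAILABLE names referenced in the listings (the actual module-level code is not reproduced on this page and may differ):

import re

try:
    import regex as REGEX_MODULE  # third-party engine with richer Unicode support
    REGEX_AVAILABLE = True
except ImportError:
    REGEX_MODULE = re  # fall back to the standard-library engine
    REGEX_AVAILABLE = False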
__init__()

Initialize and compile all tokenization patterns.

Source code in services/tokenizer/basic/patterns.py
def __init__(self):
    """Initialize and compile all tokenization patterns."""
    self._patterns: Dict[str, Any] = {}
    self._compile_patterns()
get_comprehensive_pattern(config)

Build comprehensive tokenization pattern based on configuration.

This creates a single regex pattern that finds ALL tokens in document order, eliminating the need for segmentation and reassembly. URLs and emails are conditionally included in the regex itself based on configuration, avoiding the need for post-processing filtering.

Parameters:

config (required): TokenizerConfig specifying which token types to include

Returns:

Any: Compiled regex pattern that matches all desired token types in priority order

Source code in services/tokenizer/basic/patterns.py
def get_comprehensive_pattern(self, config) -> Any:
    """
    Build comprehensive tokenization pattern based on configuration.

    This creates a single regex pattern that finds ALL tokens in document order,
    eliminating the need for segmentation and reassembly. URLs and emails are
    conditionally included in the regex itself based on configuration, avoiding
    the need for post-processing filtering.

    Args:
        config: TokenizerConfig specifying which token types to include

    Returns:
        Compiled regex pattern that matches all desired token types in priority order
    """
    pattern_parts = []

    # Conditionally add URL and email patterns based on configuration
    # This eliminates the need for post-processing filtering
    if config.include_urls:
        pattern_parts.append(self.get_pattern("url").pattern)

    if config.include_emails:
        pattern_parts.append(self.get_pattern("email").pattern)

    if config.extract_mentions:
        pattern_parts.append(self.get_pattern("mention").pattern)

    if config.extract_hashtags:
        pattern_parts.append(self.get_pattern("hashtag").pattern)

    if config.include_emoji:
        pattern_parts.append(self.get_pattern("emoji").pattern)

    if config.include_numeric:
        pattern_parts.append(self.get_pattern("numeric").pattern)

    # Always include word pattern (this is the core tokenization)
    pattern_parts.append(self.get_pattern("word").pattern)

    if config.include_punctuation:
        pattern_parts.append(self.get_pattern("punctuation").pattern)

    # Don't add the greedy fallback - let configuration control what gets captured

    # Combine patterns with alternation (| operator)
    comprehensive_pattern = "(?:" + "|".join(pattern_parts) + ")"

    try:
        return REGEX_MODULE.compile(comprehensive_pattern, REGEX_MODULE.IGNORECASE)
    except Exception:
        # Fallback to standard re module
        if REGEX_AVAILABLE and REGEX_MODULE is not re:
            try:
                return re.compile(comprehensive_pattern, re.IGNORECASE)
            except Exception:
                # Ultimate fallback - just match words
                return re.compile(r"\S+", re.IGNORECASE)
        else:
            return re.compile(r"\S+", re.IGNORECASE)
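A usage sketch (import paths are inferred from the source locations shown above, and the default TokenizerConfig is assumed to include words, URLs, and emails):

from services.tokenizer.basic.patterns import get_patterns
from services.tokenizer.core import TokenizerConfig

patterns = get_patterns()
pattern = patterns.get_comprehensive_pattern(TokenizerConfig())

# findall() yields tokens in document order, so no segmentation/reassembly pass is needed
tokens = pattern.findall("Ping @dev_team about https://example.com #release 2024")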
get_exclusion_pattern(config)

Build pattern to identify and skip excluded entities in text.

This creates a pattern that matches URLs and emails that should be excluded, allowing the tokenizer to skip over them entirely instead of breaking them into component words.

Parameters:

config (required): TokenizerConfig specifying which token types to exclude

Returns:

Any: Compiled regex pattern that matches excluded entities, or None if no exclusions

Source code in services/tokenizer/basic/patterns.py
def get_exclusion_pattern(self, config) -> Any:
    """
    Build pattern to identify and skip excluded entities in text.

    This creates a pattern that matches URLs and emails that should be excluded,
    allowing the tokenizer to skip over them entirely instead of breaking them
    into component words.

    Args:
        config: TokenizerConfig specifying which token types to exclude

    Returns:
        Compiled regex pattern that matches excluded entities, or None if no exclusions
    """
    exclusion_parts = []

    if not config.include_urls:
        exclusion_parts.append(self.get_pattern("url").pattern)

    if not config.include_emails:
        exclusion_parts.append(self.get_pattern("email").pattern)

    if not config.include_numeric:
        exclusion_parts.append(self.get_pattern("numeric").pattern)

    if not exclusion_parts:
        return None

    # Combine exclusion patterns
    exclusion_pattern = "(?:" + "|".join(exclusion_parts) + ")"

    try:
        return REGEX_MODULE.compile(exclusion_pattern, REGEX_MODULE.IGNORECASE)
    except Exception:
        # Fallback to standard re module
        if REGEX_AVAILABLE and REGEX_MODULE is not re:
            try:
                return re.compile(exclusion_pattern, re.IGNORECASE)
            except Exception:
                return None
        else:
            return None
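A sketch of the intended use, mirroring _extract_tokens_ordered in the tokenizer module further below; it assumes TokenizerConfig accepts its flags as keyword arguments:

from services.tokenizer.basic.patterns import get_patterns
from services.tokenizer.core import TokenizerConfig

patterns = get_patterns()
config = TokenizerConfig(include_urls=False)  # assumed keyword form of the include_urls flag

exclusion = patterns.get_exclusion_pattern(config)
text = "Read https://example.com/report then reply"
if exclusion is not None:
    # blank out excluded entities so they are never split into component words
    text = " ".join(exclusion.sub(" ", text).split())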
get_pattern(pattern_name)

Get compiled pattern by name.

Parameters:

pattern_name (str, required): Name of the pattern to retrieve

Returns:

Any: Compiled regex pattern

Raises:

KeyError: If pattern name is not found

Source code in services/tokenizer/basic/patterns.py
def get_pattern(self, pattern_name: str) -> Any:
    """
    Get compiled pattern by name.

    Args:
        pattern_name: Name of the pattern to retrieve

    Returns:
        Compiled regex pattern

    Raises:
        KeyError: If pattern name is not found
    """
    if pattern_name not in self._patterns:
        raise KeyError(f"Pattern '{pattern_name}' not found")
    return self._patterns[pattern_name]
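A brief lookup sketch; the registered names come from _compile_patterns above, and the missing name in the except branch is purely illustrative:

from services.tokenizer.basic.patterns import get_patterns

patterns = get_patterns()
print(patterns.list_patterns())       # includes "url", "email", "word", ...

email_re = patterns.get_pattern("email")
match = email_re.search("contact me@example.com today")

try:
    patterns.get_pattern("sentence")  # hypothetical, unregistered name
except KeyError as exc:
    print(exc)                        # the KeyError message names the missing pattern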
list_patterns()

Get list of available pattern names.

Source code in services/tokenizer/basic/patterns.py
def list_patterns(self) -> List[str]:
    """Get list of available pattern names."""
    return list(self._patterns.keys())

get_patterns()

Get global TokenizerPatterns instance.

Returns:

TokenizerPatterns: Singleton TokenizerPatterns instance

Source code in services/tokenizer/basic/patterns.py
def get_patterns() -> TokenizerPatterns:
    """
    Get global TokenizerPatterns instance.

    Returns:
        Singleton TokenizerPatterns instance
    """
    global _global_patterns
    if _global_patterns is None:
        _global_patterns = TokenizerPatterns()
    return _global_patterns
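Because the instance is cached in the module-level _global_patterns variable, repeated calls return the same object and the regexes are compiled only once per process:

from services.tokenizer.basic.patterns import get_patterns

first = get_patterns()
second = get_patterns()
assert first is second  # same TokenizerPatterns instance reused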

tokenize_text(text, config=None)

Simple convenience function for basic text tokenization.

Source code in services/tokenizer/basic/__init__.py
def tokenize_text(text: str, config: TokenizerConfig | None = None) -> list[str]:
    """Simple convenience function for basic text tokenization."""
    tokenizer = create_basic_tokenizer(config)
    return tokenizer.tokenize(text)
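A minimal call sketch; create_basic_tokenizer is the factory referenced in the listing, and the exact tokens returned depend on the default TokenizerConfig:

from services.tokenizer.basic import tokenize_text

tokens = tokenize_text("Loving the new release! #launch @dev_team")
print(tokens)  # word, hashtag, and mention tokens in document order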

tokenizer

BasicTokenizer implementation.

This module contains the main BasicTokenizer class that implements Unicode-aware tokenization for social media text with entity preservation.

Classes:

BasicTokenizer: Unicode-aware basic tokenizer for social media text.

BasicTokenizer

Bases: AbstractTokenizer

Unicode-aware basic tokenizer for social media text.

This tokenizer handles mixed-script content, preserves social media entities (@mentions, #hashtags, URLs), and applies appropriate tokenization strategies for different script families.

Methods:

__init__: Initialize BasicTokenizer with configuration.
tokenize: Tokenize input text into a list of tokens.

Source code in services/tokenizer/basic/tokenizer.py
class BasicTokenizer(AbstractTokenizer):
    """
    Unicode-aware basic tokenizer for social media text.

    This tokenizer handles mixed-script content, preserves social media entities
    (@mentions, #hashtags, URLs), and applies appropriate tokenization strategies
    for different script families.
    """

    def __init__(self, config: Optional[TokenizerConfig] = None):
        """
        Initialize BasicTokenizer with configuration.

        Args:
            config: Tokenizer configuration. If None, default config will be used.
        """
        super().__init__(config)
        self._patterns = get_patterns()

    def tokenize(self, text: str) -> TokenList:
        """
        Tokenize input text into a list of tokens.

        Applies appropriate tokenization strategies for mixed-script content
        while preserving social media entities and handling Unicode correctly.

        Args:
            text: Input text to tokenize

        Returns:
            List of tokens extracted from the input text in document order
        """
        if not text:
            return []

        # Apply preprocessing
        processed_text = self._preprocess_text(text)
        if not processed_text:
            return []

        # Extract tokens using comprehensive regex pattern
        tokens = self._extract_tokens(processed_text)

        # Apply post-processing
        return self._postprocess_tokens(tokens)

    def _extract_tokens(self, text: str) -> TokenList:
        """
        Extract tokens using comprehensive regex patterns.
        Preserves the original order of tokens as they appear in the input text.

        Args:
            text: Preprocessed text to tokenize

        Returns:
            List of extracted tokens in their original order
        """
        return self._extract_tokens_ordered(text, LanguageFamily.MIXED)

    def _is_char_level_script(self, char: str) -> bool:
        """Check if character belongs to a script that uses character-level tokenization (scriptio continua)."""
        code_point = ord(char)
        return (
            (0x4E00 <= code_point <= 0x9FFF)  # CJK Unified Ideographs
            or (0x3400 <= code_point <= 0x4DBF)  # CJK Extension A
            or (0x3040 <= code_point <= 0x309F)  # Hiragana
            or (0x30A0 <= code_point <= 0x30FF)  # Katakana
            or (0xAC00 <= code_point <= 0xD7AF)  # Hangul Syllables
            or (0x0E00 <= code_point <= 0x0E7F)  # Thai
            or (0x0E80 <= code_point <= 0x0EFF)  # Lao
            or (0x1000 <= code_point <= 0x109F)  # Myanmar
            or (0x1780 <= code_point <= 0x17FF)  # Khmer
        )

    def _get_char_script(self, char: str) -> str:
        """
        Get the script family for a character.

        Args:
            char: Character to analyze

        Returns:
            Script family name
        """
        code_point = ord(char)

        # Latin script
        if (
            (0x0041 <= code_point <= 0x007A)
            or (0x00C0 <= code_point <= 0x024F)
            or (0x1E00 <= code_point <= 0x1EFF)
        ):
            return "latin"

        # Character-level scripts (CJK, Thai, etc.)
        elif self._is_char_level_script(char):
            return "cjk"

        # Arabic script
        elif (
            (0x0600 <= code_point <= 0x06FF)
            or (0x0750 <= code_point <= 0x077F)
            or (0x08A0 <= code_point <= 0x08FF)
        ):
            return "arabic"

        else:
            return "other"

    def _extract_tokens_ordered(
        self, text: str, language_family: LanguageFamily
    ) -> TokenList:
        """
        Extract tokens preserving their original order in the text.

        Uses a single comprehensive regex pattern to find ALL tokens in document order,
        eliminating the need for complex segmentation and reassembly logic.
        This is the Phase 2 optimization that removes O(n×segments) complexity.

        Args:
            text: Preprocessed text to tokenize
            language_family: Detected language family for the full text

        Returns:
            List of extracted tokens in their original order
        """
        if not text.strip():
            return []

        # Remove excluded entities (URLs/emails) from text if they are disabled
        # This prevents them from being tokenized into component words
        exclusion_pattern = self._patterns.get_exclusion_pattern(self._config)
        if exclusion_pattern:
            # Replace excluded entities with spaces to maintain word boundaries
            text = exclusion_pattern.sub(" ", text)
            # Clean up multiple spaces
            text = " ".join(text.split())

        if not text.strip():
            return []

        # Get comprehensive pattern based on configuration
        # This single pattern finds ALL tokens in document order
        comprehensive_pattern = self._patterns.get_comprehensive_pattern(self._config)

        # Single regex call gets all tokens in order - this is the key optimization!
        raw_tokens = comprehensive_pattern.findall(text)

        # If no tokens were found but input has content, use fallback for edge cases
        if not raw_tokens and text.strip():
            # For pure punctuation or unrecognized content, return as single token
            # This maintains compatibility with old tokenizer behavior for edge cases
            return [text.strip()]

        # Apply postprocessing for language-specific behavior and configuration filtering
        tokens = []
        for token in raw_tokens:
            if not token.strip():
                continue

            # Clean URLs by removing trailing punctuation
            if self._is_url_like(token):
                token = self._clean_url_token(token)

            # For character-level scripts, break down multi-character tokens into individual characters
            # This maintains compatibility with existing test expectations
            if (
                language_family == LanguageFamily.CJK
                and self._contains_char_level_chars(token)
            ):
                # Only break down pure character-level tokens, not mixed tokens
                if self._is_pure_char_level_token(token):
                    tokens.extend(list(token))
                else:
                    # Mixed token - keep as is but process character-level parts
                    tokens.append(token)
            elif language_family == LanguageFamily.MIXED:
                # For mixed script, break down character-level script parts but keep Latin parts whole
                processed_tokens = self._process_mixed_script_token(token)
                tokens.extend(processed_tokens)
            else:
                tokens.append(token)

        return [token for token in tokens if token.strip()]

    def _is_punctuation_only(self, token: str) -> bool:
        """Check if token contains only punctuation."""
        punctuation_chars = ".!?;:,()[]{}\"'-~`@#$%^&*+=<>/|\\"
        return all(c in punctuation_chars for c in token)

    def _is_numeric_only(self, token: str) -> bool:
        """Check if token is purely numeric."""
        return (
            token.replace(".", "")
            .replace(",", "")
            .replace("%", "")
            .replace("$", "")
            .isdigit()
        )

    def _is_url_like(self, token: str) -> bool:
        """Check if token looks like a URL."""
        # Don't classify emails as URLs
        if self._is_email_like(token):
            return False

        # Explicit URL indicators (http://, https://, www., or protocol markers)
        if token.startswith(("http://", "https://", "www.")) or "://" in token:
            return True

        # Domain-like patterns (e.g., "example.com")
        # But NOT abbreviations (e.g., "U.S.", "c.e.o.s")
        # Heuristic: tokens made entirely of dot-separated runs of 1-3 letters
        # (optionally ending in a dot) are treated as abbreviations, not URLs.
        # This allows "example.com" but excludes "U.S." and "c.e.o.s"
        if (
            token.count(".") >= 1
            and any(c.isalpha() for c in token)
            and "@" not in token
        ):
            # Check if this looks like an abbreviation (single letters between periods)
            # Pattern: letter(s).letter(s).letter(s) where segments are 1-3 chars
            abbreviation_pattern = r"^[a-z]{1,3}(?:\.[a-z]{1,3})+\.?$"

            if re.match(abbreviation_pattern, token, re.IGNORECASE):
                return False  # This is an abbreviation, not a URL
            # If it has a period and looks like a domain, it's URL-like
            return True

        return False

    def _is_email_like(self, token: str) -> bool:
        """Check if token looks like an email address."""
        return "@" in token and "." in token and not token.startswith("@")

    def _clean_url_token(self, url_token: str) -> str:
        """Remove trailing punctuation from URL tokens."""
        trailing_punctuation = ".!?;:,)]}\"'"
        return url_token.rstrip(trailing_punctuation)

    def _contains_char_level_chars(self, token: str) -> bool:
        """Check if token contains any character-level script characters."""
        return any(self._is_char_level_script(char) for char in token)

    def _is_pure_char_level_token(self, token: str) -> bool:
        """Check if token contains only character-level script characters."""
        return all(self._is_char_level_script(char) or char.isspace() for char in token)

    def _process_mixed_script_token(self, token: str) -> TokenList:
        """Process mixed script tokens by breaking down character-level script parts."""
        if not self._contains_char_level_chars(token):
            return [token]

        result = []
        current_token = ""
        current_is_cjk = None

        for char in token:
            char_is_cjk = self._is_char_level_script(char)

            if current_is_cjk is None:
                current_is_cjk = char_is_cjk
                current_token = char
            elif char_is_cjk == current_is_cjk:
                current_token += char
            else:
                # Script change
                if current_token.strip():
                    if current_is_cjk and len(current_token) > 1:
                        # Break CJK into individual characters
                        result.extend(list(current_token))
                    else:
                        result.append(current_token)
                current_token = char
                current_is_cjk = char_is_cjk

        # Handle final token
        if current_token.strip():
            if current_is_cjk and len(current_token) > 1:
                result.extend(list(current_token))
            else:
                result.append(current_token)

        return result

    def _postprocess_tokens(self, tokens: TokenList) -> TokenList:
        """
        Apply post-processing to extracted tokens.

        Args:
            tokens: List of raw tokens

        Returns:
            Processed token list
        """
        if not tokens:
            return tokens

        # Apply base class post-processing (length filtering, whitespace stripping, etc.)
        return super()._postprocess_tokens(tokens)
__init__(config=None)

Initialize BasicTokenizer with configuration.

Parameters:

config (Optional[TokenizerConfig], default None): Tokenizer configuration. If None, default config will be used.

Source code in services/tokenizer/basic/tokenizer.py
def __init__(self, config: Optional[TokenizerConfig] = None):
    """
    Initialize BasicTokenizer with configuration.

    Args:
        config: Tokenizer configuration. If None, default config will be used.
    """
    super().__init__(config)
    self._patterns = get_patterns()
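A hedged construction sketch, assuming TokenizerConfig accepts the flags used throughout this module as keyword arguments:

from services.tokenizer.basic.tokenizer import BasicTokenizer
from services.tokenizer.core import TokenizerConfig

default_tokenizer = BasicTokenizer()  # falls back to TokenizerConfig()

# URLs and emails are skipped entirely instead of being split into fragments
# (keyword arguments are assumed to match the config field names)
strict = BasicTokenizer(TokenizerConfig(include_urls=False, include_emails=False))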
tokenize(text)

Tokenize input text into a list of tokens.

Applies appropriate tokenization strategies for mixed-script content while preserving social media entities and handling Unicode correctly.

Parameters:

text (str, required): Input text to tokenize

Returns:

TokenList: List of tokens extracted from the input text in document order

Source code in services/tokenizer/basic/tokenizer.py
def tokenize(self, text: str) -> TokenList:
    """
    Tokenize input text into a list of tokens.

    Applies appropriate tokenization strategies for mixed-script content
    while preserving social media entities and handling Unicode correctly.

    Args:
        text: Input text to tokenize

    Returns:
        List of tokens extracted from the input text in document order
    """
    if not text:
        return []

    # Apply preprocessing
    processed_text = self._preprocess_text(text)
    if not processed_text:
        return []

    # Extract tokens using comprehensive regex pattern
    tokens = self._extract_tokens(processed_text)

    # Apply post-processing
    return self._postprocess_tokens(tokens)
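An end-to-end sketch exercising the mixed-script handling described above; the exact output depends on the compiled patterns and configuration, so the comment is indicative only:

from services.tokenizer.basic.tokenizer import BasicTokenizer

tokenizer = BasicTokenizer()
tokens = tokenizer.tokenize("新発売! Try it now @acme #launch https://example.com")
# Indicatively: CJK characters come out one per token, Latin words stay whole,
# and @acme, #launch, and the URL are preserved as single tokens
print(tokens)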