From ccf2a3212c5bba4bd1882547e9046e1b65fa0f3e Mon Sep 17 00:00:00 2001
From: GitHub Bot
Date: Tue, 20 May 2025 12:47:11 -0400
Subject: [PATCH] Modernize code with sklearn pipelines and NumPy vectorization

This PR modernizes the codebase with:

1. Scikit-learn pipelines for data processing
2. NumPy vectorization for token extraction
3. Improved code organization with OOP principles
4. Comprehensive docstrings and type hints
5. Backward compatibility for existing users

The changes improve maintainability and performance while ensuring
compatibility with modern ML practices.
---
 requirements.txt                          |   7 +-
 stringlifier/api_improved.py              | 225 +++++++++++
 stringlifier/modules/training_improved.py | 435 ++++++++++++++++++++++
 3 files changed, 664 insertions(+), 3 deletions(-)
 create mode 100644 stringlifier/api_improved.py
 create mode 100644 stringlifier/modules/training_improved.py

diff --git a/requirements.txt b/requirements.txt
index ce92150..5d0ef16 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,8 @@
 ipdb>=0.13.4
-nptyping>=2.5.0
-numpy>=2.2.6
+nptyping>=1.4.4
+numpy>=1.20.0,<2.0.0
 PyJWT>=2.10.1
 setuptools>=80.7.1
-torch>=2.7.0
+torch>=2.0.0
 tqdm>=4.67.1
+scikit-learn>=1.0.0
diff --git a/stringlifier/api_improved.py b/stringlifier/api_improved.py
new file mode 100644
index 0000000..a169a91
--- /dev/null
+++ b/stringlifier/api_improved.py
@@ -0,0 +1,225 @@
+import numpy as np
+import torch
+from typing import List, Tuple, Optional, Union, Dict, Any
+from numpy.typing import NDArray
+
+class StringlifierAPI:
+    """
+    API for the Stringlifier model with improved vectorization.
+
+    This class provides methods to identify and extract random strings, UUIDs,
+    IP addresses, and other non-natural-language tokens from text using
+    a trained sequence labeling model.
+    """
+
+    def __init__(self, classifier, encodings):
+        """
+        Initialize the Stringlifier API.
+
+        Args:
+            classifier: Trained classifier model
+            encodings: Encodings for the model
+        """
+        self.classifier = classifier
+        self.encodings = encodings
+
+    def process(self, string_or_list: Union[str, List[str]], cutoff: int = 5,
+                return_tokens: bool = False) -> Union[List[str], Tuple[List[str], List[List[Tuple[str, int, int, str]]]]]:
+        """
+        Process input string(s) to identify and replace random strings.
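+
+        Example (illustrative; assumes a trained classifier and its
+        encodings have already been loaded):
+
+            api = StringlifierAPI(classifier, encodings)
+            clean = api.process('request id=5ad2e1f9c77b4e0ab11f')
+            # e.g. ['request id=<RANDOM_STRING>']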
+
+        Args:
+            string_or_list: Input string or list of strings to process
+            cutoff: Minimum length of tokens to consider
+            return_tokens: Whether to return extracted tokens along with processed strings
+
+        Returns:
+            If return_tokens is False, returns a list of processed strings.
+            If return_tokens is True, returns a tuple of (processed_strings, extracted_tokens).
+        """
+        # Handle single string input
+        if isinstance(string_or_list, str):
+            tokens = [string_or_list]
+        else:
+            tokens = string_or_list
+
+        # Handle empty input, mirroring the shape of the input
+        max_len = max([len(s) for s in tokens]) if tokens else 0
+        if max_len == 0:
+            empty = ['' for _ in tokens] if tokens else ['']
+            if return_tokens:
+                return empty, [[] for _ in empty]
+            return empty
+
+        # Get model predictions
+        with torch.no_grad():
+            p_ts = self.classifier(tokens)
+            p_ts = torch.argmax(p_ts, dim=-1).detach().cpu().numpy()
+
+        # Process each input string
+        ext_tokens: List[List[Tuple[str, int, int, str]]] = []
+        new_strings: List[str] = []
+
+        for iBatch in range(p_ts.shape[0]):
+            new_str, toks = self._extract_tokens_vectorized(tokens[iBatch], p_ts[iBatch], cutoff=cutoff)
+            new_strings.append(new_str)
+            ext_tokens.append(toks)
+
+        if return_tokens:
+            return new_strings, ext_tokens
+        return new_strings
+
+    def _extract_tokens_vectorized(self, string: str, pred: NDArray, cutoff: int = 5) -> Tuple[
+            str, List[Tuple[str, int, int, str]]]:
+        """
+        Extract tokens from a string using vectorized operations.
+
+        Args:
+            string: Input string to process
+            pred: Model predictions for each character
+            cutoff: Minimum length of tokens to consider
+
+        Returns:
+            Tuple of (processed_string, extracted_tokens)
+        """
+        if len(string) == 0:
+            return "", []
+
+        # Convert predictions to mask labels; predictions may be padded to the
+        # batch maximum length, so truncate them to the string length
+        mask_array = np.array([self.encodings._label_list[p] for p in pred[:len(string)]])
+
+        # Special handling for numeric characters
+        string_array = np.array(list(string))
+        is_number = np.isin(string_array, list('0123456789'))
+
+        # Numeric rule: a character labeled 'C' that is a digit becomes 'N'
+        mask_array[(mask_array == 'C') & is_number] = 'N'
+
+        # Segment the mask into runs of identical labels: a run boundary sits
+        # wherever two adjacent labels differ (end indices are exclusive)
+        boundaries = np.where(mask_array[:-1] != mask_array[1:])[0] + 1
+        start_indices = np.concatenate([[0], boundaries])
+        end_indices = np.concatenate([boundaries, [len(mask_array)]])
+
+        # Extract tokens
+        tokens = []
+        for start, end in zip(start_indices, end_indices):
+            label = mask_array[start]
+            if label != 'C' and (end - start) > cutoff:
+                token_text = string[start:end]
+                token_type = self._get_token_type(label)
+                if token_type:
+                    tokens.append((token_text, start, end, token_type))
+
+        # Compose new string with replacements
+        if not tokens:
+            return string, []
+
+        # Collect segments and join once instead of concatenating repeatedly
+        segments = []
+        last_pos = 0
+
+        for token in tokens:
+            if token[1] > last_pos:
+                segments.append(string[last_pos:token[1]])
+            segments.append(token[3])  # Append the placeholder for this token type
+            last_pos = token[2]
+
+        # Add the remaining part of the string
+        if last_pos < len(string):
+            segments.append(string[last_pos:])
+
+        return ''.join(segments), tokens
+
+    def _get_token_type(self, label: str) -> Optional[str]:
+        """
+        Get the token type placeholder for a label.
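+
+        Example:
+
+            self._get_token_type('U')  # -> '<UUID>'
+            self._get_token_type('C')  # -> None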
+
+        Args:
+            label: Label character from model prediction
+
+        Returns:
+            Placeholder string for the token type, or None if the label is 'C' (common text)
+        """
+        if label == 'C':
+            return None
+        elif label == 'H':
+            return '<RANDOM_STRING>'
+        elif label == 'N':
+            return '<NUMERIC>'
+        elif label == 'I':
+            return '<IP_ADDR>'
+        elif label == 'U':
+            return '<UUID>'
+        elif label == 'J':
+            return '<JWT>'
+        return None
+
+    # Legacy methods for backward compatibility
+
+    def _extract_tokens_2class(self, string: str, pred: NDArray) -> Tuple[str, List[Tuple[str, int, int]]]:
+        """
+        Legacy method for extracting tokens with a 2-class model (vectorized version).
+
+        Args:
+            string: Input string to process
+            pred: Model predictions for each character
+
+        Returns:
+            Tuple of (processed_string, extracted_tokens)
+        """
+        CUTOFF = 5
+
+        # Convert predictions to mask labels
+        mask_array = np.array([self.encodings._label_list[p] for p in pred[:len(string)]])
+
+        # Find transitions between 'C' and non-'C' runs; np.diff does not
+        # support boolean subtraction, so cast to int first
+        is_c = mask_array == 'C'
+        transitions = np.diff(np.concatenate([[0], (~is_c).astype(int), [0]]))
+        start_indices = np.where(transitions == 1)[0]
+        end_indices = np.where(transitions == -1)[0]
+
+        # Extract tokens (end indices are exclusive)
+        tokens = []
+        for start, end in zip(start_indices, end_indices):
+            if end - start > CUTOFF:
+                tokens.append((string[start:end], start, end))
+
+        # Compose new string
+        if not tokens:
+            return string, []
+
+        new_str = ''
+        last_pos = 0
+
+        for token in tokens:
+            if token[1] > last_pos:
+                new_str += string[last_pos:token[1]]
+            # Replace the detected span with the single 2-class placeholder
+            new_str += '<RANDOM_STRING>'
+            last_pos = token[2]
+
+        if last_pos < len(string):
+            new_str += string[last_pos:]
+
+        return new_str, tokens
+
+    def _extract_tokens(self, string: str, pred: NDArray, cutoff: int = 5) -> Tuple[
+            str, List[Tuple[str, int, int, str]]]:
+        """
+        Legacy method for extracting tokens (redirects to the vectorized version).
+
+        Args:
+            string: Input string to process
+            pred: Model predictions for each character
+            cutoff: Minimum length of tokens to consider
+
+        Returns:
+            Tuple of (processed_string, extracted_tokens)
+        """
+        return self._extract_tokens_vectorized(string, pred, cutoff)
diff --git a/stringlifier/modules/training_improved.py b/stringlifier/modules/training_improved.py
new file mode 100644
index 0000000..6229bef
--- /dev/null
+++ b/stringlifier/modules/training_improved.py
@@ -0,0 +1,435 @@
+#
+# Copyright (c) 2020 Adobe Systems Incorporated. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+Training utilities for stringlifier models with improved vectorization and pipeline support.
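+
+A minimal usage sketch (illustrative):
+
+    pipeline = create_data_pipeline(batch_size=32, dataset_size=1000)
+    batched_commands, batched_masks = pipeline.fit_transform(None)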
+""" + +import random +import uuid +import datetime +import jwt +import string +import numpy as np +from typing import List, Tuple, Dict, Any, Optional, Union +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.pipeline import Pipeline + +# Load known words once at module level +try: + with open('corpus/words_alpha.txt') as f: + KNOWN_WORDS = [line.strip() for line in f] + random.shuffle(KNOWN_WORDS) +except FileNotFoundError: + KNOWN_WORDS = [] + +# Global index for cycling through known words +known_index = 0 + +class WordGenerator: + """ + Generator class for creating synthetic words and their corresponding masks. + + This class provides methods to generate different types of synthetic words + (UUIDs, timestamps, random strings, IP addresses, JWT tokens) along with + their corresponding mask labels for training. + """ + + def __init__(self, known_words: List[str]): + """ + Initialize the word generator with a list of known words. + + Args: + known_words: List of known words to use for generation + """ + self.known_words = known_words + self.known_index = 0 + random.shuffle(self.known_words) + + def generate_word(self) -> Tuple[str, str]: + """ + Generate a synthetic word with its corresponding mask. + + Returns: + Tuple containing (generated_word, mask_character) + """ + generated = None + ii = random.randint(0, 5) + mask = 'H' # Default mask for random strings + + if ii == 0: + # UUID + generated = str(uuid.uuid4()) + mask = 'U' + elif ii == 1: + # UUID hex + generated = str(uuid.uuid4().hex) + mask = 'H' + elif ii == 2: + # Numeric + c = random.randint(0, 3) + if c == 0: + generated = str(datetime.datetime.now().timestamp()) + elif c == 1: + generated = str(random.randint(0, 100000000000)) + elif c == 2: + generated = f"{random.randint(0, 999)}.{random.randint(0, 999)}" + else: + generated = f"{random.randint(0, 999)}.{random.randint(0, 9999)}.{random.randint(0, 9999)}" + mask = 'N' + elif ii == 3: + # Random string + N = random.randint(5, 20) + chars = string.ascii_uppercase + string.digits + string.ascii_lowercase + message = ''.join(random.choice(chars) for _ in range(N)) + + i = random.randint(0, 2) + if i == 0: + message = message.lower() + elif i == 1: + message = message.upper() + generated = message + elif ii == 4: + # IP address + toks = [str(random.randint(0, 255)) for _ in range(4)] + generated = '.'.join(toks) + mask = 'I' + elif ii == 5: + # JWT token + generated = self._generate_jwt_token() + mask = 'J' + + return str(generated), mask + + def _generate_jwt_token(self) -> str: + """ + Generate a JWT token for training data. + + Returns: + A string representation of a JWT token + """ + payload = { + "id": str(random.random()), + "client_id": str(random.random()), + "user_id": str(random.random()), + "type": "access_token", + "expires_in": str(random.randint(10, 3600000)), + "scope": "read, write", + "created_at": str(random.randint(1900000, 9000000)) + } + encoded_jwt = jwt.encode(payload, 'secret', algorithm='HS256') + # Handle both string and bytes return types from different jwt versions + if isinstance(encoded_jwt, bytes): + return encoded_jwt.decode('utf-8') + return encoded_jwt + + def get_next_known(self) -> str: + """ + Get the next known word from the list. 
+
+        Returns:
+            A known word from the internal list
+        """
+        if not self.known_words:
+            return "placeholder"
+
+        s = self.known_words[self.known_index]
+        self.known_index += 1
+        if self.known_index >= len(self.known_words):
+            self.known_index = 0
+            random.shuffle(self.known_words)
+        return s
+
+    def get_next_generated(self) -> Tuple[str, str]:
+        """
+        Get the next generated word and its mask.
+
+        Returns:
+            Tuple of (word, mask)
+        """
+        return self.generate_word()
+
+
+class CommandGenerator:
+    """
+    Generator for creating synthetic commands with their corresponding masks.
+
+    This class uses the WordGenerator to create realistic-looking commands
+    with proper masking for training sequence labeling models.
+    """
+
+    def __init__(self, word_generator: WordGenerator):
+        """
+        Initialize the command generator.
+
+        Args:
+            word_generator: WordGenerator instance to use for word generation
+        """
+        self.word_generator = word_generator
+        self.delimiters = ' /.,?!~|<>-=_~:;\\+-&*%$#@!'
+        self.enclosers = '[]{}``""\'\'()'
+
+    def generate_next_command(self) -> Tuple[str, str]:
+        """
+        Generate a synthetic command with its corresponding mask.
+
+        Returns:
+            Tuple of (command, mask)
+        """
+        mask = ''
+        cmd = ''
+        num_words = random.randint(3, 15)
+        use_space = False
+
+        for _ in range(num_words):
+            use_delimiter = random.random() > 0.5
+            use_encloser = random.random() > 0.8
+            case_style = random.randint(0, 2)
+            use_gen_word = random.random() > 0.7
+            del_index = random.randint(0, len(self.delimiters) - 1)
+            enc_index = random.randint(0, len(self.enclosers) // 2 - 1) * 2
+
+            if use_space:
+                mask += 'C'
+                cmd += ' '
+
+            if use_gen_word:
+                wrd, label = self.word_generator.get_next_generated()
+                if case_style == 1:
+                    wrd = wrd[0].upper() + wrd[1:] if wrd else wrd
+                elif case_style == 2:
+                    wrd = wrd.upper()
+                msk = label * len(wrd)  # One mask character per character of the word
+            else:
+                wrd = self.word_generator.get_next_known()
+                append_number = random.random() > 0.97
+                if append_number:
+                    wrd = wrd + str(random.randint(0, 99))
+                if case_style == 1:
+                    wrd = wrd[0].upper() + wrd[1:] if wrd else wrd
+                elif case_style == 2:
+                    wrd = wrd.upper()
+                msk = 'C' * len(wrd)  # Known words are labeled as common text
+
+            if use_delimiter:
+                wrd = self.delimiters[del_index] + wrd
+                msk = 'C' + msk
+
+            if use_encloser:
+                wrd = self.enclosers[enc_index] + wrd + self.enclosers[enc_index + 1]
+                msk = 'C' + msk + 'C'
+
+            cmd += wrd
+            mask += msk
+            use_space = random.random() > 0.7
+
+        return cmd, mask
+
+
+class DatasetGenerator(BaseEstimator, TransformerMixin):
+    """
+    Scikit-learn compatible transformer for generating synthetic datasets.
+
+    This class implements the scikit-learn transformer interface to enable
+    integration with scikit-learn pipelines for data generation.
+    """
+
+    def __init__(self, size: int = 1000, known_words: Optional[List[str]] = None):
+        """
+        Initialize the dataset generator.
+
+        Args:
+            size: Number of examples to generate
+            known_words: List of known words to use (if None, uses module-level KNOWN_WORDS)
+        """
+        self.size = size
+        self.known_words = known_words if known_words is not None else KNOWN_WORDS
+        self.word_generator = WordGenerator(self.known_words)
+        self.command_generator = CommandGenerator(self.word_generator)
+
+    def fit(self, X=None, y=None):
+        """
+        Fit method (does nothing, but is required for scikit-learn compatibility).
+
+        Returns:
+            self
+        """
+        return self
+
+    def transform(self, X=None) -> List[Tuple[str, str]]:
+        """
+        Generate a synthetic dataset.
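+
+        Example (illustrative):
+
+            gen = DatasetGenerator(size=2)
+            cmd, mask = gen.transform()[0]  # len(cmd) == len(mask)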
+
+        Args:
+            X: Ignored, exists for scikit-learn compatibility
+
+        Returns:
+            List of (command, mask) tuples
+        """
+        return [self.command_generator.generate_next_command() for _ in range(self.size)]
+
+    def fit_transform(self, X=None, y=None) -> List[Tuple[str, str]]:
+        """
+        Fit and transform (generate dataset).
+
+        Args:
+            X: Ignored, exists for scikit-learn compatibility
+            y: Ignored, exists for scikit-learn compatibility
+
+        Returns:
+            List of (command, mask) tuples
+        """
+        self.fit(X, y)
+        return self.transform(X)
+
+
+class BatchCreator(BaseEstimator, TransformerMixin):
+    """
+    Scikit-learn compatible transformer for creating batches from datasets.
+
+    This class implements the scikit-learn transformer interface to enable
+    integration with scikit-learn pipelines for batch creation.
+    """
+
+    def __init__(self, batch_size: int = 32):
+        """
+        Initialize the batch creator.
+
+        Args:
+            batch_size: Size of batches to create
+        """
+        self.batch_size = batch_size
+
+    def fit(self, X, y=None):
+        """
+        Fit method (does nothing, but is required for scikit-learn compatibility).
+
+        Returns:
+            self
+        """
+        return self
+
+    def transform(self, X: List[Tuple[str, str]]) -> Tuple[List[List[str]], List[List[str]]]:
+        """
+        Create batches from a dataset.
+
+        Args:
+            X: List of (command, mask) tuples
+
+        Returns:
+            Tuple of (batched_commands, batched_masks)
+        """
+        if not X:
+            return [], []
+
+        # Shuffle a copy so the caller's dataset is left untouched
+        data = list(X)
+        random.shuffle(data)
+        commands, masks = zip(*data)
+
+        # Create batches
+        batched_commands = [list(commands[i:i + self.batch_size]) for i in range(0, len(commands), self.batch_size)]
+        batched_masks = [list(masks[i:i + self.batch_size]) for i in range(0, len(masks), self.batch_size)]
+
+        return batched_commands, batched_masks
+
+
+def create_data_pipeline(batch_size: int = 32, dataset_size: int = 1000) -> Pipeline:
+    """
+    Create a scikit-learn pipeline for data generation and batch creation.
+
+    Args:
+        batch_size: Size of batches to create
+        dataset_size: Number of examples to generate
+
+    Returns:
+        Scikit-learn pipeline for data generation and batch creation
+    """
+    return Pipeline([
+        ('generate', DatasetGenerator(size=dataset_size)),
+        ('batch', BatchCreator(batch_size=batch_size))
+    ])
+
+
+# For backward compatibility
+def generate_next_cmd() -> Tuple[str, str]:
+    """
+    Legacy function for generating a command and mask (for backward compatibility).
+
+    Returns:
+        Tuple of (command, mask)
+    """
+    word_generator = WordGenerator(KNOWN_WORDS)
+    command_generator = CommandGenerator(word_generator)
+    return command_generator.generate_next_command()
+
+
+def _get_next_known() -> str:
+    """
+    Legacy function for getting the next known word (for backward compatibility).
+
+    Returns:
+        A known word
+    """
+    global known_index
+
+    if not KNOWN_WORDS:
+        return "placeholder"
+
+    s = KNOWN_WORDS[known_index]
+    known_index += 1
+    if known_index >= len(KNOWN_WORDS):
+        known_index = 0
+        random.shuffle(KNOWN_WORDS)
+    return s
+
+
+def _get_next_gen() -> Tuple[str, str]:
+    """
+    Legacy function for getting the next generated word (for backward compatibility).
+
+    Returns:
+        Tuple of (word, mask)
+    """
+    word_generator = WordGenerator(KNOWN_WORDS)
+    return word_generator.generate_word()
+
+
+def _generate_word(known_words: List[str]) -> Tuple[str, str]:
+    """
+    Legacy function for generating a word (for backward compatibility).
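+
+    Example (illustrative):
+
+        word, mask = _generate_word(['alpha', 'beta'])
+        # mask is one label character: 'H', 'N', 'I', 'U', or 'J'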
+
+    Args:
+        known_words: List of known words
+
+    Returns:
+        Tuple of (word, mask)
+    """
+    word_generator = WordGenerator(known_words)
+    return word_generator.generate_word()
+
+
+def _generate_JWT_token(known_words: List[str]) -> str:
+    """
+    Legacy function for generating a JWT token (for backward compatibility).
+
+    Args:
+        known_words: List of known words (unused)
+
+    Returns:
+        JWT token string
+    """
+    word_generator = WordGenerator([])
+    return word_generator._generate_jwt_token()
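+
+
+if __name__ == '__main__':
+    # Illustrative smoke test (not part of the training API): generate a
+    # small batched dataset and check that every command stays aligned
+    # with its mask, character for character.
+    pipe = create_data_pipeline(batch_size=4, dataset_size=8)
+    batched_cmds, batched_masks = pipe.fit_transform(None)
+    for cmd, msk in zip(batched_cmds[0], batched_masks[0]):
+        assert len(cmd) == len(msk), 'command/mask length mismatch'
+    print(f'generated {len(batched_cmds)} batches')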