From 040e44664c6c6b4845fb88625670d358cf8ed13d Mon Sep 17 00:00:00 2001 From: iamprecieee Date: Fri, 21 Feb 2025 14:43:16 +0100 Subject: [PATCH 1/2] chore(analyzer): change example suggestion code format - changed example code from inline code formatting to codeblock --- src/core/analyzer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/analyzer.py b/src/core/analyzer.py index 7783f17..8ab0c50 100644 --- a/src/core/analyzer.py +++ b/src/core/analyzer.py @@ -82,7 +82,7 @@ def _check_format(self, message: str) -> list[CommitIssue]: CommitIssue( severity="high", message="Invalid commit type", - suggestion=f"Use '{likely_type}' for this kind of change\nExample: `{self.example_commits[likely_type]}`", + suggestion=f"Use '{likely_type}' for this kind of change\n└─ Example:\n• ```{self.example_commits[likely_type]}```", ) ] return [] @@ -232,4 +232,4 @@ def format_analysis(self, commit: dict, issues: list[CommitIssue]) -> str: "└─ Git Best Practices: " ) - return f"{commit_details}{analysis_section}{suggestions}" \ No newline at end of file + return f"{commit_details}{analysis_section}{suggestions}" From 356196e3d9b3045c83b0625cd318b1c032714757 Mon Sep 17 00:00:00 2001 From: iamprecieee Date: Fri, 21 Feb 2025 17:55:26 +0100 Subject: [PATCH 2/2] refactor(analyzer): better gibberish detection and cleanup - Added improved gibberish detection method - Properly documented all 4 detection processes --- src/config/data.py | 15 ++++++++++ src/core/analyzer.py | 69 +++++++++++++++++++++++++++++++++++--------- 2 files changed, 70 insertions(+), 14 deletions(-) diff --git a/src/config/data.py b/src/config/data.py index 740bc72..5a0f5ce 100644 --- a/src/config/data.py +++ b/src/config/data.py @@ -395,3 +395,18 @@ ("streamline", "delivery"), ], } + +VALID_PAIRS = { + "bl", "br", "ch", "cl", "cr", "dr", "fl", "fr", "gl", "gr", + "ph", "pl", "pr", "sc", "sh", "sk", "sl", "sm", "sn", "sp", + "st", "sw", "th", "tr", "tw", "wh", "wr" +} + +LETTER_FREQUENCY = 
def _check_gibberish(self, word: str) -> bool:
    """
    Determines if a word is likely to be gibberish using multiple linguistic patterns.

    The word is lowercased and stripped of leading/trailing punctuation, then
    run through four independent checks:
    1. Vowel ratio: fails when fewer than 20% of the letters are vowels
       ('y' is counted as a vowel)
    2. Consonant sequences: fails on a run of more than 4 consecutive consonants
    3. Letter frequency: for words of 4+ letters, fails when the mean absolute
       difference between each distinct letter's share of the word and its
       LETTER_FREQUENCY share (a percentage) exceeds 0.5
    4. Consonant pairs: fails when more than one adjacent consonant pair is
       missing from VALID_PAIRS

    A word is considered gibberish if it fails two or more of these checks.

    Words that cannot be judged are never reported as gibberish: empty or
    single-character words, words containing anything other than letters, and
    words containing non-ASCII letters (the vowel set and both lookup tables
    only describe a-z, so such letters would all be miscounted as consonants).

    Args:
        word: A single token; surrounding punctuation is ignored.

    Returns:
        True if at least two checks fail, otherwise False.
    """
    vowels = frozenset("aeiouy")

    token = word.lower().strip(string.punctuation)
    # isalpha() alone also accepts accented and non-Latin letters, which none
    # of the checks below can classify; treat those words as "not gibberish".
    if len(token) < 2 or not (token.isascii() and token.isalpha()):
        return False

    length = len(token)
    failed_checks = 0

    # Check 1: vowel ratio.
    if sum(ch in vowels for ch in token) / length < 0.2:
        failed_checks += 1

    # Check 2: longest run of consecutive consonants.
    run = 0
    for ch in token:
        run = 0 if ch in vowels else run + 1
        if run > 4:
            failed_checks += 1
            break

    # Check 3: letter-frequency deviation (skipped for very short words).
    # NOTE(review): a mean deviation above 0.5 appears reachable only when a
    # single letter dominates the word (e.g. "zzzz") - confirm the threshold.
    if length >= 4:
        # dict.fromkeys keeps first-seen order, so the sum is deterministic.
        distinct = dict.fromkeys(token)
        deviation = sum(
            abs(LETTER_FREQUENCY[ch] / 100 - token.count(ch) / length)
            for ch in distinct
            if ch in LETTER_FREQUENCY
        )
        if deviation / len(distinct) > 0.5:
            failed_checks += 1

    # Check 4: adjacent consonant pairs that are not in VALID_PAIRS.
    invalid_pairs = 0
    for first, second in zip(token, token[1:]):
        if first in vowels or second in vowels:
            continue
        if first + second not in VALID_PAIRS:
            invalid_pairs += 1
            if invalid_pairs > 1:
                failed_checks += 1
                break

    return failed_checks >= 2