Skip to content

Commit 356196e

Browse files
committed
refactor(analyzer): better gibberish detection and cleanup
- Added improved gibberish detection method
- Properly documented all 4 detection processes
1 parent 040e446 commit 356196e

File tree

2 files changed

+70
-14
lines changed

2 files changed

+70
-14
lines changed

src/config/data.py

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -395,3 +395,18 @@
395395
("streamline", "delivery"),
396396
],
397397
}
398+
399+
# Two-letter consonant combinations that the analyzer's gibberish check
# accepts as valid. Any other pair of adjacent consonants counts as an
# "invalid pair". All entries are lowercase because the word is lower-cased
# before lookup.
VALID_PAIRS = {
    "bl", "br", "ch", "cl", "cr", "dr", "fl", "fr", "gl", "gr",
    "ph", "pl", "pr", "sc", "sh", "sk", "sl", "sm", "sn", "sp",
    "st", "sw", "th", "tr", "tw", "wh", "wr",
}
404+
405+
# Approximate relative frequency of each letter in English text, expressed
# as a percentage (consumers divide by 100 to get a ratio). Keys are the 26
# lowercase ASCII letters, listed from most to least frequent.
LETTER_FREQUENCY = {
    "e": 12.7, "t": 9.1, "a": 8.2, "o": 7.5, "i": 7.0,
    "n": 6.7, "s": 6.3, "h": 6.1, "r": 6.0, "d": 4.3,
    "l": 4.0, "c": 2.8, "u": 2.8, "m": 2.4, "w": 2.4,
    "f": 2.2, "g": 2.0, "y": 2.0, "p": 1.9, "b": 1.5,
    "v": 0.98, "k": 0.77, "j": 0.15, "x": 0.15,
    "q": 0.095, "z": 0.074,
}

src/core/analyzer.py

Lines changed: 55 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,8 @@
44
example_commits,
55
commit_training_data,
66
semantic_patterns,
7+
VALID_PAIRS,
8+
LETTER_FREQUENCY
79
)
810
from fastapi import HTTPException, status
911
from sklearn.metrics.pairwise import cosine_similarity
@@ -121,24 +123,63 @@ def _suggest_commit_type(self, message: str) -> str:
121123

122124
def _check_gibberish(self, word: str) -> bool:
123125
"""
124-
Determines if a word is likely to be gibberish based on vowel content.
126+
Determines if a word is likely to be gibberish using multiple linguistic patterns.
125127
126-
Criteria for identifying gibberish:
127-
- Words shorter than 2 characters must contain at least one vowel
128-
- Words 2 characters or longer must have a vowel ratio of at least 0.2
128+
The function employs four distinct checks to identify gibberish:
129+
1. Vowel ratio: Words must maintain a minimum vowel-to-length ratio of 0.2
130+
2. Consonant sequences: Flags sequences of more than 4 consecutive consonants
131+
3. Letter frequency: For words >= 4 chars, compares letter frequencies against English language norms
132+
4. Consonant pairs: Identifies invalid consonant combinations that rarely occur in English
133+
134+
A word is considered gibberish if it fails two or more of these checks.
129135
"""
130-
cleaned_word = word.strip(string.punctuation)
131-
if not cleaned_word:
136+
VOWELS = set('aeiouyAEIOUY')
137+
138+
word = word.lower().strip(string.punctuation)
139+
if not word or len(word) < 2 or not word.isalpha():
132140
return False
133141

134-
vowels = set("aeiouyAEIOUY")
135-
vowel_count = sum(1 for char in cleaned_word if char in vowels)
136-
137-
if len(cleaned_word) < 2:
138-
return vowel_count == 0
139-
else:
140-
vowel_ratio = vowel_count / len(cleaned_word)
141-
return vowel_ratio < 0.2
142+
failed_checks = 0
143+
144+
vowel_count = sum(1 for c in word if c in VOWELS)
145+
if vowel_count / len(word) < 0.2:
146+
failed_checks += 1
147+
148+
consonant_sequence = 0
149+
for char in word:
150+
if char not in VOWELS:
151+
consonant_sequence += 1
152+
if consonant_sequence > 4:
153+
failed_checks += 1
154+
break
155+
else:
156+
consonant_sequence = 0
157+
158+
if len(word) >= 4:
159+
char_counts = {}
160+
for char in word:
161+
char_counts[char] = char_counts.get(char, 0) + 1
162+
163+
deviation = 0
164+
for char, count in char_counts.items():
165+
if char in LETTER_FREQUENCY:
166+
expected = LETTER_FREQUENCY[char] / 100
167+
actual = count / len(word)
168+
deviation += abs(expected - actual)
169+
170+
if (deviation / len(char_counts)) > 0.5:
171+
failed_checks += 1
172+
173+
invalid_pairs = 0
174+
for i in range(len(word) - 1):
175+
pair = word[i:i+2]
176+
if pair not in VALID_PAIRS and pair[0] not in VOWELS and pair[1] not in VOWELS:
177+
invalid_pairs += 1
178+
if invalid_pairs > 1:
179+
failed_checks += 1
180+
break
181+
182+
return failed_checks >= 2
142183

143184
def _check_content_quality(self, message: str) -> list[CommitIssue]:
144185
"""

0 commit comments

Comments
 (0)