|
4 | 4 | example_commits, |
5 | 5 | commit_training_data, |
6 | 6 | semantic_patterns, |
| 7 | + VALID_PAIRS, |
| 8 | + LETTER_FREQUENCY |
7 | 9 | ) |
8 | 10 | from fastapi import HTTPException, status |
9 | 11 | from sklearn.metrics.pairwise import cosine_similarity |
@@ -121,24 +123,63 @@ def _suggest_commit_type(self, message: str) -> str: |
121 | 123 |
|
122 | 124 | def _check_gibberish(self, word: str) -> bool: |
123 | 125 | """ |
124 | | - Determines if a word is likely to be gibberish based on vowel content. |
| 126 | + Determines if a word is likely to be gibberish using multiple linguistic patterns. |
125 | 127 | |
126 | | - Criteria for identifying gibberish: |
127 | | - - Words shorter than 2 characters must contain at least one vowel |
128 | | - - Words 2 characters or longer must have a vowel ratio of at least 0.2 |
| 128 | + The function employs four distinct checks to identify gibberish: |
| 129 | + 1. Vowel ratio: Words must maintain a minimum vowel-to-length ratio of 0.2 |
| 130 | + 2. Consonant sequences: Flags sequences of more than 4 consecutive consonants |
| 131 | + 3. Letter frequency: For words >= 4 chars, compares letter frequencies against English language norms |
| 132 | + 4. Consonant pairs: Identifies invalid consonant combinations that rarely occur in English |
| 133 | + |
| 134 | + A word is considered gibberish if it fails two or more of these checks. |
129 | 135 | """ |
130 | | - cleaned_word = word.strip(string.punctuation) |
131 | | - if not cleaned_word: |
| 136 | + VOWELS = set('aeiouyAEIOUY') |
| 137 | + |
| 138 | + word = word.lower().strip(string.punctuation) |
| 139 | + if not word or len(word) < 2 or not word.isalpha(): |
132 | 140 | return False |
133 | 141 |
|
134 | | - vowels = set("aeiouyAEIOUY") |
135 | | - vowel_count = sum(1 for char in cleaned_word if char in vowels) |
136 | | - |
137 | | - if len(cleaned_word) < 2: |
138 | | - return vowel_count == 0 |
139 | | - else: |
140 | | - vowel_ratio = vowel_count / len(cleaned_word) |
141 | | - return vowel_ratio < 0.2 |
| 142 | + failed_checks = 0 |
| 143 | + |
| 144 | + vowel_count = sum(1 for c in word if c in VOWELS) |
| 145 | + if vowel_count / len(word) < 0.2: |
| 146 | + failed_checks += 1 |
| 147 | + |
| 148 | + consonant_sequence = 0 |
| 149 | + for char in word: |
| 150 | + if char not in VOWELS: |
| 151 | + consonant_sequence += 1 |
| 152 | + if consonant_sequence > 4: |
| 153 | + failed_checks += 1 |
| 154 | + break |
| 155 | + else: |
| 156 | + consonant_sequence = 0 |
| 157 | + |
| 158 | + if len(word) >= 4: |
| 159 | + char_counts = {} |
| 160 | + for char in word: |
| 161 | + char_counts[char] = char_counts.get(char, 0) + 1 |
| 162 | + |
| 163 | + deviation = 0 |
| 164 | + for char, count in char_counts.items(): |
| 165 | + if char in LETTER_FREQUENCY: |
| 166 | + expected = LETTER_FREQUENCY[char] / 100 |
| 167 | + actual = count / len(word) |
| 168 | + deviation += abs(expected - actual) |
| 169 | + |
| 170 | + if (deviation / len(char_counts)) > 0.5: |
| 171 | + failed_checks += 1 |
| 172 | + |
| 173 | + invalid_pairs = 0 |
| 174 | + for i in range(len(word) - 1): |
| 175 | + pair = word[i:i+2] |
| 176 | + if pair not in VALID_PAIRS and pair[0] not in VOWELS and pair[1] not in VOWELS: |
| 177 | + invalid_pairs += 1 |
| 178 | + if invalid_pairs > 1: |
| 179 | + failed_checks += 1 |
| 180 | + break |
| 181 | + |
| 182 | + return failed_checks >= 2 |
142 | 183 |
|
143 | 184 | def _check_content_quality(self, message: str) -> list[CommitIssue]: |
144 | 185 | """ |
|
0 commit comments