44 example_commits ,
55 commit_training_data ,
66 semantic_patterns ,
7+ VALID_PAIRS ,
8+ LETTER_FREQUENCY
79)
810from fastapi import HTTPException , status
911from sklearn .metrics .pairwise import cosine_similarity
@@ -82,7 +84,7 @@ def _check_format(self, message: str) -> list[CommitIssue]:
8284 CommitIssue (
8385 severity = "high" ,
8486 message = "Invalid commit type" ,
85- suggestion = f"Use '{ likely_type } ' for this kind of change\n Example: ` { self .example_commits [likely_type ]} `" ,
87+ suggestion = f"Use '{ likely_type } ' for this kind of change\n └─ Example: \n • ``` { self .example_commits [likely_type ]} `` `" ,
8688 )
8789 ]
8890 return []
@@ -121,24 +123,63 @@ def _suggest_commit_type(self, message: str) -> str:
121123
122124 def _check_gibberish (self , word : str ) -> bool :
123125 """
124- Determines if a word is likely to be gibberish based on vowel content .
126+ Determines if a word is likely to be gibberish using multiple linguistic patterns .
125127
126- Criteria for identifying gibberish:
127- - Words shorter than 2 characters must contain at least one vowel
128- - Words 2 characters or longer must have a vowel ratio of at least 0.2
128+ The function employs four distinct checks to identify gibberish:
129+ 1. Vowel ratio: Words must maintain a minimum vowel-to-length ratio of 0.2
130+ 2. Consonant sequences: Flags sequences of more than 4 consecutive consonants
131+ 3. Letter frequency: For words >= 4 chars, compares letter frequencies against English language norms
132+ 4. Consonant pairs: Identifies invalid consonant combinations that rarely occur in English
133+
134+ A word is considered gibberish if it fails two or more of these checks.
129135 """
130- cleaned_word = word .strip (string .punctuation )
131- if not cleaned_word :
136+ VOWELS = set ('aeiouyAEIOUY' )
137+
138+ word = word .lower ().strip (string .punctuation )
139+ if not word or len (word ) < 2 or not word .isalpha ():
132140 return False
133141
134- vowels = set ("aeiouyAEIOUY" )
135- vowel_count = sum (1 for char in cleaned_word if char in vowels )
136-
137- if len (cleaned_word ) < 2 :
138- return vowel_count == 0
139- else :
140- vowel_ratio = vowel_count / len (cleaned_word )
141- return vowel_ratio < 0.2
142+ failed_checks = 0
143+
144+ vowel_count = sum (1 for c in word if c in VOWELS )
145+ if vowel_count / len (word ) < 0.2 :
146+ failed_checks += 1
147+
148+ consonant_sequence = 0
149+ for char in word :
150+ if char not in VOWELS :
151+ consonant_sequence += 1
152+ if consonant_sequence > 4 :
153+ failed_checks += 1
154+ break
155+ else :
156+ consonant_sequence = 0
157+
158+ if len (word ) >= 4 :
159+ char_counts = {}
160+ for char in word :
161+ char_counts [char ] = char_counts .get (char , 0 ) + 1
162+
163+ deviation = 0
164+ for char , count in char_counts .items ():
165+ if char in LETTER_FREQUENCY :
166+ expected = LETTER_FREQUENCY [char ] / 100
167+ actual = count / len (word )
168+ deviation += abs (expected - actual )
169+
170+ if (deviation / len (char_counts )) > 0.5 :
171+ failed_checks += 1
172+
173+ invalid_pairs = 0
174+ for i in range (len (word ) - 1 ):
175+ pair = word [i :i + 2 ]
176+ if pair not in VALID_PAIRS and pair [0 ] not in VOWELS and pair [1 ] not in VOWELS :
177+ invalid_pairs += 1
178+ if invalid_pairs > 1 :
179+ failed_checks += 1
180+ break
181+
182+ return failed_checks >= 2
142183
143184 def _check_content_quality (self , message : str ) -> list [CommitIssue ]:
144185 """
@@ -232,4 +273,4 @@ def format_analysis(self, commit: dict, issues: list[CommitIssue]) -> str:
232273 "└─ Git Best Practices: <https://git-scm.com/book/en/v2/Distributed-Git-Contributing-to-a-Project|Git Contributing>"
233274 )
234275
235- return f"{ commit_details } { analysis_section } { suggestions } "
276+ return f"{ commit_details } { analysis_section } { suggestions } "
0 commit comments