Skip to content
15 changes: 15 additions & 0 deletions src/config/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,3 +395,18 @@
("streamline", "delivery"),
],
}

# Lowercase two-letter consonant combinations that the gibberish detector's
# consonant-pair check accepts as valid. Grouped by leading letter.
VALID_PAIRS = {
    "bl", "br",
    "ch", "cl", "cr",
    "dr",
    "fl", "fr",
    "gl", "gr",
    "ph", "pl", "pr",
    "sc", "sh", "sk", "sl", "sm", "sn", "sp", "st", "sw",
    "th", "tr", "tw",
    "wh", "wr",
}

# Reference frequency of each lowercase letter, expressed as a percentage
# (consumers divide by 100 to obtain a proportion). Entries are listed from
# most to least frequent.
LETTER_FREQUENCY = {
    "e": 12.7,
    "t": 9.1,
    "a": 8.2,
    "o": 7.5,
    "i": 7.0,
    "n": 6.7,
    "s": 6.3,
    "h": 6.1,
    "r": 6.0,
    "d": 4.3,
    "l": 4.0,
    "c": 2.8,
    "u": 2.8,
    "m": 2.4,
    "w": 2.4,
    "f": 2.2,
    "g": 2.0,
    "y": 2.0,
    "p": 1.9,
    "b": 1.5,
    "v": 0.98,
    "k": 0.77,
    "j": 0.15,
    "x": 0.15,
    "q": 0.095,
    "z": 0.074,
}
73 changes: 57 additions & 16 deletions src/core/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
example_commits,
commit_training_data,
semantic_patterns,
VALID_PAIRS,
LETTER_FREQUENCY
)
from fastapi import HTTPException, status
from sklearn.metrics.pairwise import cosine_similarity
Expand Down Expand Up @@ -82,7 +84,7 @@ def _check_format(self, message: str) -> list[CommitIssue]:
CommitIssue(
severity="high",
message="Invalid commit type",
suggestion=f"Use '{likely_type}' for this kind of change\nExample: `{self.example_commits[likely_type]}`",
suggestion=f"Use '{likely_type}' for this kind of change\n└─ Example:\n• ```{self.example_commits[likely_type]}```",
)
]
return []
Expand Down Expand Up @@ -121,24 +123,63 @@ def _suggest_commit_type(self, message: str) -> str:

def _check_gibberish(self, word: str) -> bool:
"""
Determines if a word is likely to be gibberish based on vowel content.
Determines if a word is likely to be gibberish using multiple linguistic patterns.

Criteria for identifying gibberish:
- Words shorter than 2 characters must contain at least one vowel
- Words 2 characters or longer must have a vowel ratio of at least 0.2
The function employs four distinct checks to identify gibberish:
1. Vowel ratio: Words must maintain a minimum vowel-to-length ratio of 0.2
2. Consonant sequences: Flags sequences of more than 4 consecutive consonants
3. Letter frequency: For words >= 4 chars, compares letter frequencies against English language norms
4. Consonant pairs: Identifies invalid consonant combinations that rarely occur in English

A word is considered gibberish if it fails two or more of these checks.
"""
cleaned_word = word.strip(string.punctuation)
if not cleaned_word:
VOWELS = set('aeiouyAEIOUY')

word = word.lower().strip(string.punctuation)
if not word or len(word) < 2 or not word.isalpha():
return False

vowels = set("aeiouyAEIOUY")
vowel_count = sum(1 for char in cleaned_word if char in vowels)

if len(cleaned_word) < 2:
return vowel_count == 0
else:
vowel_ratio = vowel_count / len(cleaned_word)
return vowel_ratio < 0.2
failed_checks = 0

vowel_count = sum(1 for c in word if c in VOWELS)
if vowel_count / len(word) < 0.2:
failed_checks += 1

consonant_sequence = 0
for char in word:
if char not in VOWELS:
consonant_sequence += 1
if consonant_sequence > 4:
failed_checks += 1
break
else:
consonant_sequence = 0

if len(word) >= 4:
char_counts = {}
for char in word:
char_counts[char] = char_counts.get(char, 0) + 1

deviation = 0
for char, count in char_counts.items():
if char in LETTER_FREQUENCY:
expected = LETTER_FREQUENCY[char] / 100
actual = count / len(word)
deviation += abs(expected - actual)

if (deviation / len(char_counts)) > 0.5:
failed_checks += 1

invalid_pairs = 0
for i in range(len(word) - 1):
pair = word[i:i+2]
if pair not in VALID_PAIRS and pair[0] not in VOWELS and pair[1] not in VOWELS:
invalid_pairs += 1
if invalid_pairs > 1:
failed_checks += 1
break

return failed_checks >= 2

def _check_content_quality(self, message: str) -> list[CommitIssue]:
"""
Expand Down Expand Up @@ -232,4 +273,4 @@ def format_analysis(self, commit: dict, issues: list[CommitIssue]) -> str:
"└─ Git Best Practices: <https://git-scm.com/book/en/v2/Distributed-Git-Contributing-to-a-Project|Git Contributing>"
)

return f"{commit_details}{analysis_section}{suggestions}"
return f"{commit_details}{analysis_section}{suggestions}"