Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions src/config/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,3 +395,18 @@
("streamline", "delivery"),
],
}

# Two-letter consonant clusters that the gibberish detector accepts as
# legitimate; rows are grouped by leading letter for easier scanning.
VALID_PAIRS = {
    "bl", "br",
    "ch", "cl", "cr",
    "dr",
    "fl", "fr",
    "gl", "gr",
    "ph", "pl", "pr",
    "sc", "sh", "sk", "sl", "sm", "sn", "sp", "st", "sw",
    "th", "tr", "tw",
    "wh", "wr",
}

# Share of each lowercase letter, expressed in percent (consumers divide by
# 100 before comparing against a word's own letter distribution).
# Entries are listed from most to least frequent.
LETTER_FREQUENCY = {
    "e": 12.7,
    "t": 9.1,
    "a": 8.2,
    "o": 7.5,
    "i": 7.0,
    "n": 6.7,
    "s": 6.3,
    "h": 6.1,
    "r": 6.0,
    "d": 4.3,
    "l": 4.0,
    "c": 2.8,
    "u": 2.8,
    "m": 2.4,
    "w": 2.4,
    "f": 2.2,
    "g": 2.0,
    "y": 2.0,
    "p": 1.9,
    "b": 1.5,
    "v": 0.98,
    "k": 0.77,
    "j": 0.15,
    "x": 0.15,
    "q": 0.095,
    "z": 0.074,
}
73 changes: 57 additions & 16 deletions src/core/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
example_commits,
commit_training_data,
semantic_patterns,
VALID_PAIRS,
LETTER_FREQUENCY
)
from fastapi import HTTPException, status
from sklearn.metrics.pairwise import cosine_similarity
Expand Down Expand Up @@ -82,7 +84,7 @@ def _check_format(self, message: str) -> list[CommitIssue]:
CommitIssue(
severity="high",
message="Invalid commit type",
suggestion=f"Use '{likely_type}' for this kind of change\nExample: `{self.example_commits[likely_type]}`",
suggestion=f"Use '{likely_type}' for this kind of change\n└─ Example:\n• ```{self.example_commits[likely_type]}```",
)
]
return []
Expand Down Expand Up @@ -121,24 +123,63 @@ def _suggest_commit_type(self, message: str) -> str:

def _check_gibberish(self, word: str) -> bool:
"""
Determines if a word is likely to be gibberish based on vowel content.
Determines if a word is likely to be gibberish using multiple linguistic patterns.

Criteria for identifying gibberish:
- Words shorter than 2 characters must contain at least one vowel
- Words 2 characters or longer must have a vowel ratio of at least 0.2
The function employs four distinct checks to identify gibberish:
1. Vowel ratio: Words must maintain a minimum vowel-to-length ratio of 0.2
2. Consonant sequences: Flags sequences of more than 4 consecutive consonants
3. Letter frequency: For words >= 4 chars, compares letter frequencies against English language norms
4. Consonant pairs: Identifies invalid consonant combinations that rarely occur in English

A word is considered gibberish if it fails two or more of these checks.
"""
cleaned_word = word.strip(string.punctuation)
if not cleaned_word:
VOWELS = set('aeiouyAEIOUY')

word = word.lower().strip(string.punctuation)
if not word or len(word) < 2 or not word.isalpha():
return False

vowels = set("aeiouyAEIOUY")
vowel_count = sum(1 for char in cleaned_word if char in vowels)

if len(cleaned_word) < 2:
return vowel_count == 0
else:
vowel_ratio = vowel_count / len(cleaned_word)
return vowel_ratio < 0.2
failed_checks = 0

vowel_count = sum(1 for c in word if c in VOWELS)
if vowel_count / len(word) < 0.2:
failed_checks += 1

consonant_sequence = 0
for char in word:
if char not in VOWELS:
consonant_sequence += 1
if consonant_sequence > 4:
failed_checks += 1
break
else:
consonant_sequence = 0

if len(word) >= 4:
char_counts = {}
for char in word:
char_counts[char] = char_counts.get(char, 0) + 1

deviation = 0
for char, count in char_counts.items():
if char in LETTER_FREQUENCY:
expected = LETTER_FREQUENCY[char] / 100
actual = count / len(word)
deviation += abs(expected - actual)

if (deviation / len(char_counts)) > 0.5:
failed_checks += 1

invalid_pairs = 0
for i in range(len(word) - 1):
pair = word[i:i+2]
if pair not in VALID_PAIRS and pair[0] not in VOWELS and pair[1] not in VOWELS:
invalid_pairs += 1
if invalid_pairs > 1:
failed_checks += 1
break

return failed_checks >= 2

def _check_content_quality(self, message: str) -> list[CommitIssue]:
"""
Expand Down Expand Up @@ -232,4 +273,4 @@ def format_analysis(self, commit: dict, issues: list[CommitIssue]) -> str:
"└─ Git Best Practices: <https://git-scm.com/book/en/v2/Distributed-Git-Contributing-to-a-Project|Git Contributing>"
)

return f"{commit_details}{analysis_section}{suggestions}"
return f"{commit_details}{analysis_section}{suggestions}"