Skip to content

Commit 260d977

Browse files
authored
Merge pull request #414 from JohT/feature/git-normalized-co-change-count
Association rule metrics for files that changed together based on the git history
2 parents fbbdf8c + 2976072 commit 260d977

14 files changed

+1249
-174
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ Contained within this repository is a comprehensive and automated code graph ana
2828

2929
### :newspaper: News
3030

31+
- August 2025: Association rule learning for co-changing files in git history
3132
- August 2025: Anomaly detection powered by unsupervised machine learning and explainable AI
3233
- May 2025: Migrated to [Neo4j 2025.x](https://neo4j.com/docs/upgrade-migration-guide/current/version-2025/upgrade) and Java 21.
3334

cypher/General_Enrichment/Add_file_name and_extension.cypher

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
1-
// Add "name", "extension" and "extensionExtended" properties to File nodes
1+
// Add "name", "extension" and "extensionExtended" properties to File nodes. Supports Git:File nodes with "relativePath" property.
22

33
MATCH (file:File)
4-
WHERE file.fileName IS NOT NULL
4+
WHERE (file.fileName IS NOT NULL OR file.relativePath IS NOT NULL)
55
AND file.name IS NULL // Don't override an already existing "name" property
66
WITH *
7-
,file.fileName AS fileName
7+
,coalesce(file.fileName, file.relativePath) AS fileName
88
WITH *
99
,last(split(fileName, '/')) AS fileNameWithoutPath
1010
WITH *
Lines changed: 83 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,103 @@
11
// Connect git files that were changed together frequently with "CHANGED_TOGETHER_WITH"
22

3-
MATCH (global_git_commit:Git:Commit)
4-
WITH count(global_git_commit) AS globalCommitCount
3+
// Determine global file count, global file count threshold (filter out refactoring commits) and global update commits
4+
MATCH (git_commit_global:Git:Commit)-[:CONTAINS_CHANGE]->(:Git:Change)-[:UPDATES]->(git_file_global:Git:File)
5+
WHERE git_file_global.deletedAt IS NULL
6+
WITH git_commit_global, count(DISTINCT git_file_global) AS commitFileCount
7+
WITH percentileDisc(commitFileCount, 0.95) AS globalFileCountThreshold
8+
,count(git_commit_global) AS globalUpdateCommitCount
9+
// Main section
510
MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change)-[:UPDATES]->(git_file:Git:File)
611
MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file)
712
WHERE git_file.deletedAt IS NULL
813
// Order files to ensure that pairs of distinct files are grouped together (fileA, fileB) without (fileB, fileA)
914
ORDER BY git_commit.sha, git_file.relativePath
10-
WITH globalCommitCount
15+
WITH globalFileCountThreshold
16+
,globalUpdateCommitCount
1117
,git_commit.sha AS commitHash
1218
,collect(DISTINCT git_file) AS filesInCommit
1319
// Limit the file count to min. 2 (changed together) and
1420
// max. the 95th percentile of distinct files per commit, "globalFileCountThreshold" (reduce permutations, improve performance, filter out large refactorings that usually affect many files)
1521
WHERE size(filesInCommit) >= 2
16-
AND size(filesInCommit) <= 50
22+
AND size(filesInCommit) <= globalFileCountThreshold
1723
// Collect distinct pairwise (..., 2, 2) combinations of all files in the list
18-
WITH globalCommitCount
24+
WITH globalFileCountThreshold
25+
,globalUpdateCommitCount
1926
,commitHash
2027
,apoc.coll.combinations(filesInCommit, 2, 2) AS fileCombinations
2128
UNWIND fileCombinations AS fileCombination
22-
WITH globalCommitCount
29+
WITH globalFileCountThreshold
30+
,globalUpdateCommitCount
2331
,fileCombination
24-
,count(DISTINCT commitHash) AS commitCount
25-
,collect(DISTINCT commitHash) AS commitHashes
26-
// Filter out file pairs that where changed not very often together
27-
// In detail: More than 0.1 per mille compared to overall commit count
28-
WHERE commitCount > globalCommitCount * 0.001
29-
WITH fileCombination[0] AS firstFile
32+
,count(DISTINCT commitHash) AS updateCommitCount
33+
,collect(DISTINCT commitHash) AS updateCommitHashes
34+
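// Note: the filter below is active. It uses a fixed minimum (more than 2 commits) instead of the former relative threshold (0.1 per mille of all commits).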
35+
// Filter out file pairs that weren't changed very often together
36+
WHERE updateCommitCount > 2
37+
WITH *
38+
,fileCombination[0] AS firstFile
3039
,fileCombination[1] AS secondFile
31-
,commitCount
32-
,commitHashes
33-
// Create the new relationship "CHANGED_TOGETHER_WITH" and set the property "commitCount" on it
34-
CALL (firstFile, secondFile, commitCount, commitHashes) {
40+
WITH *
41+
// Get the lowest number of git update commits of both files (file pair)
42+
,CASE WHEN firstFile.updateCommitCount < secondFile.updateCommitCount
43+
THEN firstFile.updateCommitCount
44+
ELSE secondFile.updateCommitCount
45+
END AS minUpdateCommitCount
46+
// Calculate update commit support by dividing the update commit count by the overall commit count for both files
47+
,toFloat(firstFile.updateCommitCount) / globalUpdateCommitCount AS firstFileUpdateSupport
48+
,toFloat(secondFile.updateCommitCount) / globalUpdateCommitCount AS secondFileUpdateSupport
49+
WITH *
50+
// Expected likelihood that the first and the second file change together given complete randomness
51+
,firstFileUpdateSupport * secondFileUpdateSupport AS expectedCoUpdateSupport
52+
WITH firstFile
53+
,secondFile
54+
,updateCommitHashes
55+
,updateCommitCount
56+
// Out of all the times the less frequently changed file was touched, how often did it co-occur with the other file?
57+
,toFloat(updateCommitCount) / minUpdateCommitCount AS updateCommitMinConfidence
58+
// Compared to all commits in general, how high is the percentage of the commits where both files changed together?
59+
,toFloat(updateCommitCount) / globalUpdateCommitCount AS updateCommitSupport
60+
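// Lift: observed co-update support divided by the support expected if both files changed independently (>1: changed together more often than by random chance)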
61+
,toFloat(updateCommitCount) / (globalUpdateCommitCount * expectedCoUpdateSupport) AS updateCommitLift
62+
// Jaccard Similarity: Of all commits involving either file, how many involved both?
63+
,toFloat(updateCommitCount) / (firstFile.updateCommitCount + secondFile.updateCommitCount - updateCommitCount) AS updateCommitJaccardSimilarity
64+
// Create the new relationship "CHANGED_TOGETHER_WITH" and set the update commit count, the commit hashes and the association rule metrics (min. confidence, support, lift, Jaccard similarity) on it
65+
CALL (firstFile, secondFile, updateCommitCount, updateCommitHashes, updateCommitMinConfidence, updateCommitSupport, updateCommitLift, updateCommitJaccardSimilarity) {
3566
MERGE (firstFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile)
36-
SET pairwiseChange.commitCount = commitCount
37-
,pairwiseChange.commitHashes = commitHashes
38-
} IN TRANSACTIONS
67+
SET pairwiseChange.updateCommitCount = toInteger(updateCommitCount)
68+
,pairwiseChange.updateCommitHashes = updateCommitHashes
69+
,pairwiseChange.updateCommitMinConfidence = updateCommitMinConfidence
70+
,pairwiseChange.updateCommitSupport = updateCommitSupport
71+
,pairwiseChange.updateCommitLift = updateCommitLift
72+
,pairwiseChange.updateCommitJaccardSimilarity = updateCommitJaccardSimilarity
73+
} IN TRANSACTIONS OF 500 ROWS
3974
// Return one row with some statistics about the found pairs and their commit counts
40-
RETURN max(commitCount) AS maxCommitCount
41-
,avg(commitCount) AS avgCommitCount
42-
,percentileDisc(commitCount, 0.5) AS percentile50CommitCount
43-
,percentileDisc(commitCount, 0.9) AS percentile90CommitCount
44-
,percentileDisc(commitCount, 0.95) AS percentile95CommitCount
45-
,count(*) AS pairCount
75+
RETURN count(*) AS pairCount
76+
77+
,min(updateCommitCount) AS minCommitCount
78+
,max(updateCommitCount) AS maxCommitCount
79+
,avg(updateCommitCount) AS avgCommitCount
80+
,percentileDisc(updateCommitCount, 0.5) AS percentile50CommitCount
81+
,percentileDisc(updateCommitCount, 0.9) AS percentile90CommitCount
82+
,percentileDisc(updateCommitCount, 0.95) AS percentile95CommitCount
83+
84+
,min(updateCommitMinConfidence) AS minMinConfidence
85+
,max(updateCommitMinConfidence) AS maxMinConfidence
86+
,avg(updateCommitMinConfidence) AS avgMinConfidence
87+
,percentileDisc(updateCommitMinConfidence, 0.5) AS percentile50MinConfidence
88+
,percentileDisc(updateCommitMinConfidence, 0.9) AS percentile90MinConfidence
89+
,percentileDisc(updateCommitMinConfidence, 0.95) AS percentile95MinConfidence
90+
91+
,min(updateCommitLift) AS minLift
92+
,max(updateCommitLift) AS maxLift
93+
,avg(updateCommitLift) AS avgLift
94+
,percentileDisc(updateCommitLift, 0.5) AS percentile50Lift
95+
,percentileDisc(updateCommitLift, 0.9) AS percentile90Lift
96+
,percentileDisc(updateCommitLift, 0.95) AS percentile95Lift
97+
98+
,min(updateCommitJaccardSimilarity) AS minJaccardSimilarity
99+
,max(updateCommitJaccardSimilarity) AS maxJaccardSimilarity
100+
,avg(updateCommitJaccardSimilarity) AS avgJaccardSimilarity
101+
,percentileDisc(updateCommitJaccardSimilarity, 0.5) AS percentile50JaccardSimilarity
102+
,percentileDisc(updateCommitJaccardSimilarity, 0.9) AS percentile90JaccardSimilarity
103+
,percentileDisc(updateCommitJaccardSimilarity, 0.95) AS percentile95JaccardSimilarity

cypher/GitLog/List_git_files_that_were_changed_together.cypher

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,5 +6,5 @@ MATCH (gitRepository:Git&Repository)-[:HAS_FILE]->(firstGitFile)
66
MATCH (gitRepository:Git&Repository)-[:HAS_FILE]->(secondGitFile)
77
RETURN gitRepository.name + '/' + firstGitFile.relativePath AS firstFile
88
,gitRepository.name + '/' + secondGitFile.relativePath AS secondFile
9-
,gitChange.commitCount AS commitCount
9+
,gitChange.updateCommitCount AS commitCount
1010
ORDER BY commitCount DESC
Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,16 @@
11
// List git files that were frequently changed with another file. Requires "Add_CHANGED_TOGETHER_WITH_relationships_to_git_files".
22

33
MATCH (firstGitFile:Git&File&!Repository)-[gitChange:CHANGED_TOGETHER_WITH]-(secondGitFile:Git&File&!Repository)
4+
WHERE elementId(firstGitFile) < elementId(secondGitFile)
45
MATCH (gitRepository:Git&Repository)-[:HAS_FILE]->(firstGitFile)
5-
UNWIND gitChange.commitHashes AS commitHash
6-
RETURN gitRepository.name + '/' + firstGitFile.relativePath AS filePath
6+
UNWIND gitChange.updateCommitHashes AS commitHash
7+
WITH gitRepository.name + '/' + firstGitFile.relativePath AS filePath
78
,count(DISTINCT commitHash) AS commitCount
9+
,sum(firstGitFile.updateCommitCount) AS fileUpdateCount
10+
,max(gitChange.updateCommitLift) AS maxLift
11+
,avg(gitChange.updateCommitLift) AS avgLift
12+
WITH *
13+
// Out of all the times the file was touched, how often did it co-occur with other files?
14+
,CASE WHEN fileUpdateCount > 0 THEN toFloat(commitCount) / fileUpdateCount ELSE 0.0 END AS coChangeRate
15+
RETURN filePath, commitCount, coChangeRate, maxLift, avgLift
816
ORDER BY commitCount DESC
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
// List pairs of files that were changed together. Requires Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher to run first.
2+
3+
MATCH (firstFile:Git:File)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile:Git:File)
4+
WHERE elementId(firstFile) < elementId(secondFile)
5+
WITH *
6+
,coalesce(firstFile.relativePath, firstFile.fileName) AS firstFileName
7+
,coalesce(secondFile.relativePath, secondFile.fileName) AS secondFileName
8+
RETURN firstFileName
9+
,secondFileName
10+
,firstFile.name + '<br>' + secondFile.name AS filePairLineBreak
11+
,firstFileName + '<br>' + secondFileName AS filePairWithRelativePathLineBreak
12+
,firstFile.name + '↔' + secondFile.name AS filePair
13+
,firstFileName + '↔' + secondFileName AS filePairWithRelativePath
14+
,firstFile.extension AS firstFileExtension
15+
,secondFile.extension AS secondFileExtension
16+
,firstFile.extension + '↔' + secondFile.extension AS fileExtensionPair
17+
,pairwiseChange.updateCommitCount AS updateCommitCount
18+
,pairwiseChange.updateCommitMinConfidence AS updateCommitMinConfidence
19+
,pairwiseChange.updateCommitSupport AS updateCommitSupport
20+
,pairwiseChange.updateCommitLift AS updateCommitLift
21+
,pairwiseChange.updateCommitJaccardSimilarity AS updateCommitJaccardSimilarity
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
// Get the top 4 file extension pairs that were changed together most often and list the top 20 file pairs that were changed together for each of these extension pairs, ordered by the metric selected with the parameter "selected_pair_metric" (e.g. the commit lift, >1: changes more often than by random chance). Requires Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher to run first.
2+
3+
MATCH (firstFile:Git:File)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile:Git:File)
4+
WHERE firstFile.extension < secondFile.extension
5+
OR (firstFile.extension = secondFile.extension AND elementId(firstFile) < elementId(secondFile))
6+
WITH firstFile.extension + '↔' + secondFile.extension AS fileExtensionPair
7+
,count(DISTINCT pairwiseChange) AS pairCount
8+
ORDER BY pairCount DESC
9+
WITH collect(fileExtensionPair)[0..4] AS top4FileExtensionPairs
10+
UNWIND top4FileExtensionPairs AS fileExtensionPair
11+
CALL {
12+
WITH fileExtensionPair
13+
MATCH (firstFile:Git:File)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile:Git:File)
14+
WHERE elementId(firstFile) < elementId(secondFile)
15+
AND firstFile.extension + '↔' + secondFile.extension = fileExtensionPair
16+
WITH *
17+
,coalesce(firstFile.relativePath, firstFile.fileName) AS firstFileName
18+
,coalesce(secondFile.relativePath, secondFile.fileName) AS secondFileName
19+
RETURN firstFile.name AS firstFileNameShort
20+
,secondFile.name AS secondFileNameShort
21+
,firstFileName
22+
,secondFileName
23+
,pairwiseChange[$selected_pair_metric] AS selectedMetric
24+
,pairwiseChange.updateCommitLift AS updateCommitLift
25+
,pairwiseChange.updateCommitCount AS updateCommitCount
26+
,pairwiseChange.updateCommitMinConfidence AS updateCommitMinConfidence
27+
,pairwiseChange.updateCommitSupport AS updateCommitSupport
28+
,pairwiseChange.updateCommitJaccardSimilarity AS updateCommitJaccardSimilarity
29+
ORDER BY selectedMetric DESC, firstFileName ASC, secondFileName ASC
30+
LIMIT 20
31+
}
32+
RETURN fileExtensionPair
33+
,firstFileNameShort
34+
,secondFileNameShort
35+
,updateCommitCount
36+
,updateCommitMinConfidence
37+
,updateCommitLift
38+
,updateCommitJaccardSimilarity
39+
,updateCommitSupport
40+
,firstFileName
41+
,secondFileName
Lines changed: 16 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,24 @@
1-
// List pair of files that were changed together and that have a declared dependency between each other.
1+
// List pairs of files that were changed together and that have a declared dependency between each other. Requires Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher and Add_CHANGED_TOGETHER_WITH_relationships_to_code_files.cypher to run first.
22

33
MATCH (firstCodeFile:File)-[dependency:DEPENDS_ON]->(secondCodeFile:File)
44
MATCH (firstCodeFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondCodeFile)
5-
//De-duplicating the pairs of files isn't necessary, because the dependency relation is directed.
6-
//WHERE elementId(firstCodeFile) < elementId(secondCodeFile)
7-
WITH firstCodeFile.fileName AS firstFileName
8-
,secondCodeFile.fileName AS secondFileName
5+
WHERE elementId(firstCodeFile) < elementId(secondCodeFile)
6+
WITH firstCodeFile.fileName AS firstFileName
7+
,secondCodeFile.fileName AS secondFileName
98
,coalesce(dependency.weight, dependency.cardinality) AS dependencyWeight
10-
,pairwiseChange.commitCount AS commitCount
11-
,dependency.fileDistanceAsFewestChangeDirectoryCommands AS fileDistanceAsFewestChangeDirectoryCommands
9+
,dependency.fileDistanceAsFewestChangeDirectoryCommands AS fileDistance
10+
,pairwiseChange.updateCommitCount AS commitCount
11+
,pairwiseChange.updateCommitMinConfidence AS updateCommitMinConfidence
12+
,pairwiseChange.updateCommitSupport AS updateCommitSupport
13+
,pairwiseChange.updateCommitLift AS updateCommitLift
14+
,pairwiseChange.updateCommitJaccardSimilarity AS updateCommitJaccardSimilarity
1215
RETURN dependencyWeight
16+
,fileDistance
1317
,commitCount
14-
,fileDistanceAsFewestChangeDirectoryCommands
18+
,updateCommitMinConfidence
19+
,updateCommitSupport
20+
,updateCommitLift
21+
,updateCommitJaccardSimilarity
1522
// ,count(*) AS occurrences
1623
// ,collect(firstFileName + ' -> ' + secondFileName)[0..3] AS examples
17-
ORDER BY dependencyWeight, commitCount
18-
19-
// MATCH (firstCodeFile:File)-[dependency:DEPENDS_ON]->(secondCodeFile:File)
20-
// MATCH (firstCodeFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondCodeFile)
21-
// WHERE elementId(firstCodeFile) < elementId(secondCodeFile)
22-
// RETURN firstCodeFile.fileName AS firstFileName
23-
// ,secondCodeFile.fileName AS secondFileName
24-
// ,dependency.weight AS dependencyWeight
25-
// ,pairwiseChange.commitCount AS commitCount
26-
// ORDER BY dependencyWeight, commitCount
27-
28-
// MATCH (g1:!Git&File)-[relation:CHANGED_TOGETHER_WITH|DEPENDS_ON]-(g2:!Git&File)
29-
// WITH count(DISTINCT relation) AS relatedFilesCount
30-
// ,collect(DISTINCT relation) AS relations
31-
// UNWIND relations AS relation
32-
// WITH relatedFilesCount
33-
// ,coalesce(relation.commitCount, 0) AS commitCount
34-
// ,coalesce(relation.weight, 0) AS dependencyWeight
35-
// ,coalesce(relation.fileDistanceAsFewestChangeDirectoryCommands, 0) AS fileDistanceAsFewestChangeDirectoryCommands
36-
// RETURN dependencyWeight
37-
// ,commitCount
38-
// ,fileDistanceAsFewestChangeDirectoryCommands
39-
// ORDER BY dependencyWeight, commitCount
24+
ORDER BY dependencyWeight, commitCount
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
// Set updateCommitCount property on Git File nodes when git commits with Update modifier (detected by the plugin) are present
2+
3+
MATCH (git_file:File&Git)<-[:UPDATES]-(:Git&Change)<-[:CONTAINS_CHANGE]-(git_commit:Git&Commit)
4+
WHERE git_file.deletedAt IS NULL
5+
WITH git_file, count(DISTINCT git_commit.sha) AS updateCommitCount
6+
SET git_file.updateCommitCount = updateCommitCount
7+
WITH git_file, updateCommitCount
8+
MATCH (code_file:File&!Git)<-[:RESOLVES_TO]-(git_file)
9+
SET code_file.updateCommitCount = updateCommitCount
10+
RETURN count(DISTINCT code_file) AS codeFileUpdates
11+
,collect(DISTINCT code_file.name)[0..4] AS codeFileExample
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
// Verify if CHANGED_TOGETHER_WITH properties from git are missing
2+
3+
MATCH (firstCodeFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondCodeFile)
4+
RETURN (pairwiseChange.updateCommitCount IS NULL) AS updateCommitCountMissing
5+
,(pairwiseChange.updateCommitMinConfidence IS NULL) AS updateCommitMinConfidenceMissing
6+
,count(*)

0 commit comments

Comments
 (0)