11// Connect git files that were changed together frequently with "CHANGED_TOGETHER_WITH"
22
3- MATCH (global_git_commit :Git :Commit )
4- WITH count (global_git_commit ) AS globalCommitCount
3+ // Determine the file count per commit, its 95th percentile as global file count threshold (to filter out refactoring commits) and the global number of update commits
4+ MATCH (git_commit_global :Git :Commit )- [ : CONTAINS_CHANGE ] -> (:Git :Change )- [ : UPDATES ] -> (git_file_global :Git :File )
5+ WHERE git_file_global .deletedAt IS NULL
6+ WITH git_commit_global , count (DISTINCT git_file_global ) AS commitFileCount
7+ WITH percentileDisc (commitFileCount , 0.95 ) AS globalFileCountThreshold
8+ ,count (git_commit_global ) AS globalUpdateCommitCount
9+ // Main section
510MATCH (git_commit :Git :Commit )- [ : CONTAINS_CHANGE ] -> (git_change :Git :Change )- [ : UPDATES ] -> (git_file :Git :File )
611MATCH (git_repository :Git &Repository )- [ : HAS_FILE ] -> (git_file )
712WHERE git_file .deletedAt IS NULL
813// Order files to ensure that each pair of distinct files appears in only one order: (fileA, fileB) without (fileB, fileA)
914ORDER BY git_commit .sha , git_file .relativePath
10- WITH globalCommitCount
15+ WITH globalFileCountThreshold
16+ ,globalUpdateCommitCount
1117 ,git_commit .sha AS commitHash
1218 ,collect (DISTINCT git_file ) AS filesInCommit
1319// Limit the file count to min. 2 (changed together) and
1420// max. globalFileCountThreshold, the 95th percentile of files per commit (reduce permutations, improve performance, filter out large refactorings that usually affect many files)
1521WHERE size (filesInCommit ) >= 2
16- AND size (filesInCommit ) <= 50
22+ AND size (filesInCommit ) <= globalFileCountThreshold
1723// Collect distinct pairwise (..., 2, 2) combinations of all files in the list
18- WITH globalCommitCount
24+ WITH globalFileCountThreshold
25+ ,globalUpdateCommitCount
1926 ,commitHash
2027 ,apoc .coll .combinations (filesInCommit , 2 , 2 ) AS fileCombinations
2128UNWIND fileCombinations AS fileCombination
22- WITH globalCommitCount
29+ WITH globalFileCountThreshold
30+ ,globalUpdateCommitCount
2331 ,fileCombination
24- ,count (DISTINCT commitHash ) AS commitCount
25- ,collect (DISTINCT commitHash ) AS commitHashes
26- // Filter out file pairs that where changed not very often together
27- // In detail: More than 0.1 per mille compared to overall commit count
28- WHERE commitCount > globalCommitCount * 0.001
29- WITH fileCombination [0 ] AS firstFile
32+ ,count (DISTINCT commitHash ) AS updateCommitCount
33+ ,collect (DISTINCT commitHash ) AS updateCommitHashes
34+ // Filter out file pairs that weren't changed together very often:
35+ // Keep only pairs that were updated together in more than 2 commits
36+ WHERE updateCommitCount > 2
37+ WITH *
38+ ,fileCombination [0 ] AS firstFile
3039 ,fileCombination [1 ] AS secondFile
31- ,commitCount
32- ,commitHashes
33- // Create the new relationship "CHANGED_TOGETHER_WITH" and set the property "commitCount" on it
34- CALL (firstFile , secondFile , commitCount , commitHashes ) {
40+ WITH *
41+ // Get the lowest number of git update commits of both files (file pair)
42+ ,CASE WHEN firstFile .updateCommitCount < secondFile .updateCommitCount
43+ THEN firstFile .updateCommitCount
44+ ELSE secondFile .updateCommitCount
45+ END AS minUpdateCommitCount
46+ // Calculate update commit support by dividing the update commit count by the overall commit count for both files
47+ ,toFloat (firstFile .updateCommitCount ) / globalUpdateCommitCount AS firstFileUpdateSupport
48+ ,toFloat (secondFile .updateCommitCount ) / globalUpdateCommitCount AS secondFileUpdateSupport
49+ WITH *
50+ // Expected likelihood that the first and the second file change together given complete randomness
51+ ,firstFileUpdateSupport * secondFileUpdateSupport AS expectedCoUpdateSupport
52+ WITH firstFile
53+ ,secondFile
54+ ,updateCommitHashes
55+ ,updateCommitCount
56+ // Out of all the times the less frequently changed file was touched, how often did it co-occur with the other file?
57+ ,toFloat (updateCommitCount ) / minUpdateCommitCount AS updateCommitMinConfidence
58+ // Compared to all commits in general, how high is the percentage of the commits where both files changed together?
59+ ,toFloat (updateCommitCount ) / globalUpdateCommitCount AS updateCommitSupport
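60+ // Lift: How much more often did both files change together than expected if they changed independently (observed support divided by expected support)?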
61+ ,toFloat (updateCommitCount ) / (globalUpdateCommitCount * expectedCoUpdateSupport ) AS updateCommitLift
62+ // Jaccard Similarity: Of all commits involving either file, how many involved both?
63+ ,toFloat (updateCommitCount ) / (firstFile .updateCommitCount + secondFile .updateCommitCount - updateCommitCount ) AS updateCommitJaccardSimilarity
64+ // Create the new relationship "CHANGED_TOGETHER_WITH" and set the update commit count, the commit hashes and the co-change metrics as properties on it
65+ CALL (firstFile , secondFile , updateCommitCount , updateCommitHashes , updateCommitMinConfidence , updateCommitSupport , updateCommitLift , updateCommitJaccardSimilarity ) {
3566 MERGE (firstFile )- [pairwiseChange : CHANGED_TOGETHER_WITH ]- (secondFile )
36- SET pairwiseChange .commitCount = commitCount
37- , pairwiseChange .commitHashes = commitHashes
38- } IN TRANSACTIONS
67+ SET pairwiseChange .updateCommitCount = toInteger (updateCommitCount )
68+ , pairwiseChange .updateCommitHashes = updateCommitHashes
69+ , pairwiseChange .updateCommitMinConfidence = updateCommitMinConfidence
70+ , pairwiseChange .updateCommitSupport = updateCommitSupport
71+ , pairwiseChange .updateCommitLift = updateCommitLift
72+ , pairwiseChange .updateCommitJaccardSimilarity = updateCommitJaccardSimilarity
73+ } IN TRANSACTIONS OF 500 ROWS
3974// Return one row with some statistics about the found pairs, their commit counts and their co-change metrics
40- RETURN max (commitCount ) AS maxCommitCount
41- ,avg (commitCount ) AS avgCommitCount
42- ,percentileDisc (commitCount , 0.5 ) AS percentile50CommitCount
43- ,percentileDisc (commitCount , 0.9 ) AS percentile90CommitCount
44- ,percentileDisc (commitCount , 0.95 ) AS percentile95CommitCount
45- ,count (* ) AS pairCount
75+ RETURN count (* ) AS pairCount
76+
77+ ,min (updateCommitCount ) AS minCommitCount
78+ ,max (updateCommitCount ) AS maxCommitCount
79+ ,avg (updateCommitCount ) AS avgCommitCount
80+ ,percentileDisc (updateCommitCount , 0.5 ) AS percentile50CommitCount
81+ ,percentileDisc (updateCommitCount , 0.9 ) AS percentile90CommitCount
82+ ,percentileDisc (updateCommitCount , 0.95 ) AS percentile95CommitCount
83+
84+ ,min (updateCommitMinConfidence ) AS minMinConfidence
85+ ,max (updateCommitMinConfidence ) AS maxMinConfidence
86+ ,avg (updateCommitMinConfidence ) AS avgMinConfidence
87+ ,percentileDisc (updateCommitMinConfidence , 0.5 ) AS percentile50MinConfidence
88+ ,percentileDisc (updateCommitMinConfidence , 0.9 ) AS percentile90MinConfidence
89+ ,percentileDisc (updateCommitMinConfidence , 0.95 ) AS percentile95MinConfidence
90+
91+ ,min (updateCommitLift ) AS minLift
92+ ,max (updateCommitLift ) AS maxLift
93+ ,avg (updateCommitLift ) AS avgLift
94+ ,percentileDisc (updateCommitLift , 0.5 ) AS percentile50Lift
95+ ,percentileDisc (updateCommitLift , 0.9 ) AS percentile90Lift
96+ ,percentileDisc (updateCommitLift , 0.95 ) AS percentile95Lift
97+
98+ ,min (updateCommitJaccardSimilarity ) AS minJaccardSimilarity
99+ ,max (updateCommitJaccardSimilarity ) AS maxJaccardSimilarity
100+ ,avg (updateCommitJaccardSimilarity ) AS avgJaccardSimilarity
101+ ,percentileDisc (updateCommitJaccardSimilarity , 0.5 ) AS percentile50JaccardSimilarity
102+ ,percentileDisc (updateCommitJaccardSimilarity , 0.9 ) AS percentile90JaccardSimilarity
103+ ,percentileDisc (updateCommitJaccardSimilarity , 0.95 ) AS percentile95JaccardSimilarity
0 commit comments