6
6
7
7
import org .togetherjava .tjbot .config .Config ;
8
8
import org .togetherjava .tjbot .config .ScamBlockerConfig ;
9
- import org .togetherjava .tjbot .features .utils .StringDistances ;
10
9
11
- import java .net .URI ;
12
10
import java .util .Collection ;
13
11
import java .util .List ;
14
- import java .util .Locale ;
15
- import java .util .StringJoiner ;
16
12
import java .util .function .Predicate ;
17
13
import java .util .regex .Pattern ;
18
14
import java .util .stream .Stream ;
@@ -28,6 +24,7 @@ public final class ScamDetector {
28
24
private final ScamBlockerConfig config ;
29
25
private final Predicate <String > isSuspiciousAttachmentName ;
30
26
private final Predicate <String > hasTrustedRole ;
27
+ private final TokenAnalyse tokenAnalyse ;
31
28
32
29
/**
33
30
* Creates a new instance with the given configuration
@@ -42,6 +39,8 @@ public ScamDetector(Config config) {
42
39
.asMatchPredicate ();
43
40
hasTrustedRole =
44
41
Pattern .compile (this .config .getTrustedUserRolePattern ()).asMatchPredicate ();
42
+
43
+ tokenAnalyse = new TokenAnalyse (this .config );
45
44
}
46
45
47
46
/**
@@ -59,10 +58,11 @@ public boolean isScam(Message message) {
59
58
}
60
59
61
60
String content = message .getContentDisplay ();
62
- List <Message .Attachment > attachments = message .getAttachments ();
61
+ List <Attachment > attachments =
62
+ message .getAttachments ().stream ().map (Attachment ::fromDiscord ).toList ();
63
63
64
64
if (content .isBlank ()) {
65
- return areAttachmentsSuspicious (attachments );
65
+ return areAttachmentsScam (attachments );
66
66
}
67
67
68
68
return isScam (content );
@@ -76,158 +76,36 @@ public boolean isScam(Message message) {
76
76
*/
77
77
public boolean isScam (CharSequence message ) {
78
78
AnalyseResults results = new AnalyseResults ();
79
- TOKENIZER .splitAsStream (message ).forEach (token -> analyzeToken (token , results ));
79
+ TOKENIZER .splitAsStream (message ).forEach (token -> tokenAnalyse . analyze (token , results ));
80
80
return isScam (results );
81
81
}
82
82
83
83
private boolean isScam (AnalyseResults results ) {
84
- if (results .pingsEveryone && (results .containsSuspiciousKeyword || results .hasUrl
85
- || results .containsDollarSign )) {
84
+ if (results .pingsEveryone () && (results .containsSuspiciousKeyword () || results .hasUrl ()
85
+ || results .containsDollarSign () )) {
86
86
return true ;
87
87
}
88
88
89
- return Stream
90
- .of (results .containsSuspiciousKeyword , results .hasSuspiciousUrl ,
91
- results .containsDollarSign )
89
+ boolean hasTooManySuspiciousFlags = Stream
90
+ .of (results .containsSuspiciousKeyword () , results .hasSuspiciousUrl () ,
91
+ results .containsDollarSign () )
92
92
.filter (flag -> flag )
93
93
.count () >= 2 ;
94
- }
95
-
96
- private void analyzeToken (String token , AnalyseResults results ) {
97
- if (token .isBlank ()) {
98
- return ;
99
- }
100
-
101
- if (!results .pingsEveryone
102
- && ("@everyone" .equalsIgnoreCase (token ) || "@here" .equalsIgnoreCase (token ))) {
103
- results .pingsEveryone = true ;
104
- }
105
-
106
- if (!results .containsSuspiciousKeyword && containsSuspiciousKeyword (token )) {
107
- results .containsSuspiciousKeyword = true ;
108
- }
109
-
110
- if (!results .containsDollarSign && (token .contains ("$" ) || "usd" .equalsIgnoreCase (token ))) {
111
- results .containsDollarSign = true ;
112
- }
113
-
114
- if (token .startsWith ("http" )) {
115
- analyzeUrl (token , results );
116
- }
117
- }
118
-
119
- private void analyzeUrl (String url , AnalyseResults results ) {
120
- String host ;
121
- try {
122
- host = URI .create (url ).getHost ();
123
- } catch (IllegalArgumentException _) {
124
- // Invalid urls are not scam
125
- return ;
126
- }
127
-
128
- if (host == null ) {
129
- return ;
130
- }
131
-
132
- results .hasUrl = true ;
133
-
134
- if (config .getHostWhitelist ().contains (host )) {
135
- return ;
136
- }
137
-
138
- if (config .getHostBlacklist ().contains (host )) {
139
- results .hasSuspiciousUrl = true ;
140
- return ;
141
- }
142
-
143
- for (String keyword : config .getSuspiciousHostKeywords ()) {
144
- if (isHostSimilarToKeyword (host , keyword )) {
145
- results .hasSuspiciousUrl = true ;
146
- break ;
147
- }
94
+ if (hasTooManySuspiciousFlags ) {
95
+ return true ;
148
96
}
149
- }
150
97
151
- private boolean containsSuspiciousKeyword (String token ) {
152
- String preparedToken = token .toLowerCase (Locale .US );
153
-
154
- return config .getSuspiciousKeywords ()
155
- .stream ()
156
- .map (keyword -> keyword .toLowerCase (Locale .US ))
157
- .anyMatch (keyword -> {
158
- // Exact match "^foo$"
159
- if (startsWith (keyword , '^' ) && endsWith (keyword , '$' )) {
160
- return preparedToken .equals (keyword .substring (1 , keyword .length () - 1 ));
161
- }
162
- // Simple regex-inspired syntax "^foo"
163
- if (startsWith (keyword , '^' )) {
164
- return preparedToken .startsWith (keyword .substring (1 ));
165
- }
166
- // Simple regex-inspired syntax "foo$"
167
- if (endsWith (keyword , '$' )) {
168
- return preparedToken .endsWith (keyword .substring (0 , keyword .length () - 1 ));
169
- }
170
- return preparedToken .contains (keyword );
171
- });
98
+ return results .onlyContainsUrls () && results .areAllUrlsWithAttachments ()
99
+ && areAttachmentsScam (results .getUrlAttachments ());
172
100
}
173
101
174
- private boolean areAttachmentsSuspicious (Collection <? extends Message . Attachment > attachments ) {
102
+ private boolean areAttachmentsScam (Collection <Attachment > attachments ) {
175
103
long suspiciousAttachments =
176
104
attachments .stream ().filter (this ::isAttachmentSuspicious ).count ();
177
105
return suspiciousAttachments >= config .getSuspiciousAttachmentsThreshold ();
178
106
}
179
107
180
- private boolean isAttachmentSuspicious (Message .Attachment attachment ) {
181
- return attachment .isImage () && isSuspiciousAttachmentName .test (attachment .getFileName ());
182
- }
183
-
184
- private boolean isHostSimilarToKeyword (String host , String keyword ) {
185
- // NOTE This algorithm is far from optimal.
186
- // It is good enough for our purpose though and not that complex.
187
-
188
- // Rolling window of keyword-size over host.
189
- // If any window has a small distance, it is similar
190
- int windowStart = 0 ;
191
- int windowEnd = keyword .length ();
192
- while (windowEnd <= host .length ()) {
193
- String window = host .substring (windowStart , windowEnd );
194
- int distance = StringDistances .editDistance (keyword , window );
195
-
196
- if (distance <= config .getIsHostSimilarToKeywordDistanceThreshold ()) {
197
- return true ;
198
- }
199
-
200
- windowStart ++;
201
- windowEnd ++;
202
- }
203
-
204
- return false ;
205
- }
206
-
207
- private static boolean startsWith (CharSequence text , char prefixToTest ) {
208
- return !text .isEmpty () && text .charAt (0 ) == prefixToTest ;
209
- }
210
-
211
- private static boolean endsWith (CharSequence text , char suffixToTest ) {
212
- return !text .isEmpty () && text .charAt (text .length () - 1 ) == suffixToTest ;
213
- }
214
-
215
- private static class AnalyseResults {
216
- private boolean pingsEveryone ;
217
- private boolean containsSuspiciousKeyword ;
218
- private boolean containsDollarSign ;
219
- private boolean hasUrl ;
220
- private boolean hasSuspiciousUrl ;
221
-
222
- @ Override
223
- public String toString () {
224
- return new StringJoiner (", " , AnalyseResults .class .getSimpleName () + "[" , "]" )
225
- .add ("pingsEveryone=" + pingsEveryone )
226
- .add ("containsSuspiciousKeyword=" + containsSuspiciousKeyword )
227
- .add ("containsDollarSign=" + containsDollarSign )
228
- .add ("hasUrl=" + hasUrl )
229
- .add ("hasSuspiciousUrl=" + hasSuspiciousUrl )
230
- .toString ();
231
- }
108
+ private boolean isAttachmentSuspicious (Attachment attachment ) {
109
+ return attachment .isImage () && isSuspiciousAttachmentName .test (attachment .fileName ());
232
110
}
233
111
}
0 commit comments