+ """Command line tools.
+
+ pattern-make:
+     Load URLs, cluster them and generate URL patterns.
+
+ pattern-matcher:
+     Load patterns, match URLs and get the matched results.
+
+ """
from __future__ import print_function

import argparse
- import logging
+ import logging.config
import sys
import time
from collections import Counter
- from logging.config import dictConfig

from .compat import binary_stdin, binary_stdout
from .config import get_default_config
from .formatter import FORMATTERS
from .pattern_maker import PatternMaker
from .pattern_matcher import PatternMatcher
- from .utils import LogSpeedAdapter, pretty_counter, MemoryUsageFormatter
+ from .utils import LogSpeedAdapter, MemoryUsageFormatter, pretty_counter
+
+ _DEFAULT_LOGGING = {
+     'version': 1,
+     'disable_existing_loggers': True,
+     'incremental': True,
+ }


def _config_logging(log_level):
-     logging.config.dictConfig(_DEFAULT_LOGGING)
+     logging.config.dictConfig(_DEFAULT_LOGGING)
    if log_level == 'NOTSET':
        handler = logging.NullHandler()
    else:
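The logging change above drops the `from logging.config import dictConfig` name import in favour of `logging.config.dictConfig`, and `_DEFAULT_LOGGING` uses `'incremental': True`, which only adjusts levels/propagation of already-configured loggers and never attaches handlers itself, so `_config_logging` still has to add one. A minimal sketch of how the branch cut off by this hunk might continue; everything after the `NullHandler` case is an assumption (the real code likely plugs in the imported `MemoryUsageFormatter` rather than a plain `logging.Formatter`):

    import logging
    import logging.config

    _DEFAULT_LOGGING = {'version': 1,
                        'disable_existing_loggers': True,
                        'incremental': True}

    def _config_logging(log_level):
        # Incremental config: existing loggers and handlers are left in place.
        logging.config.dictConfig(_DEFAULT_LOGGING)
        if log_level == 'NOTSET':
            handler = logging.NullHandler()
        else:
            # Assumed handler and format; only the NullHandler branch is shown in the diff.
            handler = logging.StreamHandler()
            handler.setFormatter(logging.Formatter(
                '%(asctime)s [%(levelname)s] %(name)s %(message)s'))
        root = logging.getLogger()
        root.addHandler(handler)
        root.setLevel(log_level)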
@@ -89,6 +103,7 @@ def add_argument(self, parser):
                            type=lambda s: s.upper())

    def _load(self, pattern_maker, args):
+         load_url = args.formatter == 'CLUSTER'
        stats = Counter()
        speed_logger = LogSpeedAdapter(self._logger, 5000)
        for url in args.file[0]:
@@ -100,7 +115,8 @@ def _load(self, pattern_maker, args):
            speed_logger.debug('[LOADING]')
            try:
                url = url.decode(DEFAULT_ENCODING)
-                 _, is_new = pattern_maker.load(url)
+                 _, is_new = pattern_maker.load(
+                     url, meta=url if load_url else None)
                if is_new:
                    stats['UNIQ'] += 1
                stats['VALID'] += 1
@@ -119,20 +135,18 @@ def _load(self, pattern_maker, args):
        self._logger.debug('[LOADED] %s', pretty_counter(stats))

    def _process(self, pattern_maker, args):
+         combine = args.formatter == 'ETE'
        formatter = FORMATTERS[args.formatter]()
        s = time.time()
-         combine = args.formatter == 'ETE'
        for maker in pattern_maker.makers:
-             for url_meta, root in maker.make(combine):
+             for root in maker.make(combine):
                e = time.time()
                self._logger.debug('[CLUSTER] %d %.2fs', root.count, e - s)
-                 for record in formatter.format(url_meta, root):
+                 for record in formatter.format(maker.url_meta, root):
                    print(record)
                s = time.time()

    def _confirm_config(self, args):
-         if args.formatter != 'CLUSTER':
-             self._config.set('make', 'drop_url', 'true')
        self._config.freeze()

    def run(self, args):
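Read together, the `_load` and `_process` changes keep the original URL as `meta` only when the CLUSTER formatter needs to echo it back (which is why the `drop_url` toggling in `_confirm_config` goes away), and take `url_meta` from each maker instead of having `make()` yield it. A rough sketch of the resulting flow, reusing only calls visible in this diff; the already-configured `pattern_maker` and the iterable of decoded URL strings are assumptions:

    def cluster(pattern_maker, urls, formatter_name='CLUSTER'):
        # Mirrors _load: the URL string is only needed as meta for CLUSTER output.
        keep_url = formatter_name == 'CLUSTER'
        for url in urls:
            pattern_maker.load(url, meta=url if keep_url else None)

        # Mirrors _process: the ETE formatter asks the maker to combine while clustering.
        formatter = FORMATTERS[formatter_name]()
        combine = formatter_name == 'ETE'
        for maker in pattern_maker.makers:
            for root in maker.make(combine):
                for record in formatter.format(maker.url_meta, root):
                    yield record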
@@ -165,7 +179,7 @@ def add_argument(self, parser):
    def _load(self, pattern_matcher, args):
        stats = Counter()
        io_input = args.pattern_file[0]
-         self._logger.debug('[LOAD] Start %s', io_input.name)
+         self._logger.debug('[LOAD] Pattern file: %s', io_input.name)
        speed_logger = LogSpeedAdapter(self._logger, 1000)
        for line in io_input:
            speed_logger.debug('[LOADING]')
@@ -224,13 +238,6 @@ def run(self, args):
        self._match(pattern_matcher, args)


- _DEFAULT_LOGGING = {
-     'version': 1,
-     'disable_existing_loggers': True,
-     'incremental': True,
- }
-
-
def _execute(command, argv=None):
    argv = argv or sys.argv
    parser = argparse.ArgumentParser()
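On the matcher side only the pattern-file loop is visible here; the sketch below shows the presumed shape of loading pattern lines and then matching URLs. The `pattern_matcher.load(...)` and `pattern_matcher.match(...)` calls are assumptions inferred from the command's docstring and the maker-side API, not from code shown in this diff:

    def match_urls(pattern_matcher, pattern_lines, urls):
        # Assumed API: feed each serialized pattern line into the matcher ...
        for line in pattern_lines:
            pattern_matcher.load(line.rstrip('\n'))  # assumed signature
        # ... then look up every URL and emit whatever it matched.
        for url in urls:
            for matched in pattern_matcher.match(url):  # assumed signature
                yield url, matched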