Skip to content

Commit 9acff5d

Browse files
committed
1. docstring.
2. Remove drop_url from the default config. 3. Rename the badly designed class name. 4. Simplify the PatternMaker APIs. 5. Add more conditions when validating pattern unit parsing.
1 parent 9f1f99b commit 9acff5d

21 files changed

+396
-128
lines changed

src/os_urlpattern/VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0.1.8
1+
0.1.9

src/os_urlpattern/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
"""os-urlpattern.
2+
3+
Unsupervised URL clustering; generate and match URL patterns.
4+
"""
15
import sys
26
__all__ = ['__version__', 'version_info']
37

src/os_urlpattern/cmdline.py

Lines changed: 25 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,19 @@
1+
"""Command line tools.
2+
3+
pattern-make:
4+
Load URLs, cluster and generate URL pattern.
5+
6+
pattern-matcher:
7+
Load pattern, match the URL and get matched results.
8+
9+
"""
110
from __future__ import print_function
211

312
import argparse
4-
import logging
13+
import logging.config
514
import sys
615
import time
716
from collections import Counter
8-
from logging.config import dictConfig
917

1018
from .compat import binary_stdin, binary_stdout
1119
from .config import get_default_config
@@ -15,11 +23,17 @@
1523
from .formatter import FORMATTERS
1624
from .pattern_maker import PatternMaker
1725
from .pattern_matcher import PatternMatcher
18-
from .utils import LogSpeedAdapter, pretty_counter, MemoryUsageFormatter
26+
from .utils import LogSpeedAdapter, MemoryUsageFormatter, pretty_counter
27+
28+
_DEFAULT_LOGGING = {
29+
'version': 1,
30+
'disable_existing_loggers': True,
31+
'incremental': True,
32+
}
1933

2034

2135
def _config_logging(log_level):
22-
dictConfig(_DEFAULT_LOGGING)
36+
logging.config.dictConfig(_DEFAULT_LOGGING)
2337
if log_level == 'NOTSET':
2438
handler = logging.NullHandler()
2539
else:
@@ -89,6 +103,7 @@ def add_argument(self, parser):
89103
type=lambda s: s.upper())
90104

91105
def _load(self, pattern_maker, args):
106+
load_url = args.formatter == 'CLUSTER'
92107
stats = Counter()
93108
speed_logger = LogSpeedAdapter(self._logger, 5000)
94109
for url in args.file[0]:
@@ -100,7 +115,8 @@ def _load(self, pattern_maker, args):
100115
speed_logger.debug('[LOADING]')
101116
try:
102117
url = url.decode(DEFAULT_ENCODING)
103-
_, is_new = pattern_maker.load(url)
118+
_, is_new = pattern_maker.load(
119+
url, meta=url if load_url else None)
104120
if is_new:
105121
stats['UNIQ'] += 1
106122
stats['VALID'] += 1
@@ -119,20 +135,18 @@ def _load(self, pattern_maker, args):
119135
self._logger.debug('[LOADED] %s', pretty_counter(stats))
120136

121137
def _process(self, pattern_maker, args):
138+
combine = args.formatter == 'ETE'
122139
formatter = FORMATTERS[args.formatter]()
123140
s = time.time()
124-
combine = args.formatter == 'ETE'
125141
for maker in pattern_maker.makers:
126-
for url_meta, root in maker.make(combine):
142+
for root in maker.make(combine):
127143
e = time.time()
128144
self._logger.debug('[CLUSTER] %d %.2fs', root.count, e - s)
129-
for record in formatter.format(url_meta, root):
145+
for record in formatter.format(maker.url_meta, root):
130146
print(record)
131147
s = time.time()
132148

133149
def _confirm_config(self, args):
134-
if args.formatter != 'CLUSTER':
135-
self._config.set('make', 'drop_url', 'true')
136150
self._config.freeze()
137151

138152
def run(self, args):
@@ -165,7 +179,7 @@ def add_argument(self, parser):
165179
def _load(self, pattern_matcher, args):
166180
stats = Counter()
167181
io_input = args.pattern_file[0]
168-
self._logger.debug('[LOAD] Start %s', io_input.name)
182+
self._logger.debug('[LOAD] Pattrn file: %s', io_input.name)
169183
speed_logger = LogSpeedAdapter(self._logger, 1000)
170184
for line in io_input:
171185
speed_logger.debug('[LOADING]')
@@ -224,13 +238,6 @@ def run(self, args):
224238
self._match(pattern_matcher, args)
225239

226240

227-
_DEFAULT_LOGGING = {
228-
'version': 1,
229-
'disable_existing_loggers': True,
230-
'incremental': True,
231-
}
232-
233-
234241
def _execute(command, argv=None):
235242
argv = argv or sys.argv
236243
parser = argparse.ArgumentParser()

src/os_urlpattern/compat.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
"""Compatible import.
2+
"""
3+
14
import operator
25
import string
36
import sys

src/os_urlpattern/config/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
"""Configure.
2+
"""
13
from ..compat import RawConfigParser
24
try:
35
from collections import OrderedDict as _default_dict
Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,2 @@
11
[make]
2-
min_cluster_num = 3
3-
drop_url = false
2+
min_cluster_num = 3

src/os_urlpattern/definition.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
"""Definition of global constant varialbles.
2+
"""
3+
14
import hashlib
25

36
from .compat import (ascii_lowercase_unicode, ascii_uppercase_unicode,
@@ -42,7 +45,6 @@ class BasePatternRule(object):
4245
MULTI_DIGIT_AND_ASCII_UPPER = u'[0-9A-Z]+'
4346
MULTI_DIGIT_AND_ASCII = u'[0-9A-Za-z]+'
4447
DOT = u'\\.'
45-
ALL_MATCH = u'.*?'
4648
EMPTY = u''
4749
SINGLE_QUESTION = u'[\\?]'
4850

@@ -132,5 +134,4 @@ class BasePattern(object):
132134
BasePatternRule.MULTI_DIGIT_AND_ASCII_UPPER)
133135
MULTI_DIGIT_AND_ASCII = Pattern(BasePatternRule.MULTI_DIGIT_AND_ASCII)
134136
DOT = Pattern(BasePatternRule.DOT)
135-
ALL_MATCH = Pattern(BasePatternRule.ALL_MATCH)
136137
EMPTY = Pattern(BasePatternRule.EMPTY)

src/os_urlpattern/exceptions.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
"""Custom Exceptions.
2+
"""
3+
4+
15
class IrregularURLException(Exception):
26
pass
37

src/os_urlpattern/formatter.py

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
"""Clustered record formatter.
2+
"""
13
import json
24

35
from .definition import BasePatternRule, Symbols
@@ -23,13 +25,23 @@ def format(self, url_meta, clusterd, **kwargs):
2325
2426
"""
2527
yield
26-
return
2728

2829

2930
class PatternFormatter(Formatter):
3031
"""Pattern only formatter."""
3132

3233
def format(self, url_meta, root, **kwargs):
34+
"""Generate url pattern string.
35+
36+
Args:
37+
url_meta (URLMeta): The URLMeta object.
38+
root (TreeNode): Root of a clustered piece tree.
39+
**kwargs: Arbitrary keyword arguments.
40+
41+
Yields:
42+
str: URL pattern string.
43+
44+
"""
3345
for nodes in dump_tree(root):
3446
yield pack(url_meta, [p.pattern for p in nodes[1:]])
3547
break
@@ -39,6 +51,18 @@ class ClusterFormatter(PatternFormatter):
3951
"""Pattern and meta formatter."""
4052

4153
def format(self, url_meta, root, **kwargs):
54+
"""Generate url pattern string and dumps bound meta data.
55+
56+
Args:
57+
url_meta (URLMeta): The URLMeta object.
58+
root (TreeNode): Root of a clustered piece tree.
59+
**kwargs: Arbitrary keyword arguments.
60+
61+
Yields:
62+
str: Yield URL pattern string first, then yield meta
63+
data of this cluster.
64+
65+
"""
4266
for r in super(ClusterFormatter, self).format(url_meta, root, **kwargs):
4367
yield r
4468

@@ -51,6 +75,18 @@ class JsonFormatter(Formatter):
5175
"""Json record of pattern info formatter."""
5276

5377
def format(self, url_meta, root, **kwargs):
78+
"""Generate json format of URL pattern and relative info.
79+
80+
Args:
81+
url_meta (URLMeta): The URLMeta object.
82+
root (TreeNode): Root of a clustered piece tree.
83+
**kwargs: Arbitrary keyword arguments.
84+
85+
Yields:
86+
str: Json string, key-value:
87+
ptn: URL pattern string.
88+
cnt: Number of uniq path in the cluster.
89+
"""
5490
for nodes in dump_tree(root):
5591
p = pack(url_meta, [p.pattern for p in nodes[1:]])
5692
yield json.dumps({u'ptn': p, u'cnt': root.count})
@@ -61,7 +97,16 @@ class ETEFormatter(Formatter):
6197
"""Ete tree formatter."""
6298

6399
def format(self, url_meta, root, **kwargs):
100+
"""Generate ete tree string.
101+
102+
Args:
103+
url_meta (URLMeta): The URLMeta object.
104+
root (TreeNode): Root of a pattern tree.
105+
**kwargs: Arbitrary keyword arguments.
64106
107+
Yields:
108+
str: An ete tree string.
109+
"""
65110
def f(pattern_node):
66111
sep = Symbols.EMPTY
67112
query_key = Symbols.EMPTY

src/os_urlpattern/parse_utils.py

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
"""Utilitis for parsing URL and pattern.
2+
"""
13
import hashlib
24
from collections import namedtuple
35

@@ -225,7 +227,7 @@ def _filterd(x):
225227

226228

227229
def parse_query_string(query_string):
228-
"""Parse query string into keys and values
230+
"""Parse query string into keys and values.
229231
230232
Args:
231233
query_string (str): The string to be parsed.
@@ -242,7 +244,7 @@ def parse_query_string(query_string):
242244
return BLANK_TUPLE, BLANK_TUPLE
243245
elif query_string.endswith(Symbols.AMPERSAND):
244246
raise IrregularURLException("Invalid '&' pos")
245-
kv_type = True # qkey True, qvalue False
247+
kv_type = True # query_key True, query_value False
246248
last_c = None
247249
kv_buf = {True: StringIO(), False: StringIO()}
248250
kv_list = {True: [], False: []}
@@ -526,6 +528,19 @@ def _reset(self):
526528
self._piece_list = []
527529

528530
def parse(self, piece):
531+
"""Parse a string into small sub-pieces with rules.
532+
533+
The consecutive characters in the same character space
534+
will be joined into one sub-piece, the corresponding
535+
rule (character space) is also returned.
536+
537+
Args:
538+
piece (str): A string to be parsed.
539+
540+
Returns:
541+
tuple: 2-tuple, (pieces, rules).
542+
"""
543+
529544
self._reset()
530545
self._preprocess(piece)
531546
return self._create_parsed_piece()
@@ -614,7 +629,7 @@ def analyze_url_pattern_string(url_pattern_string):
614629
url_pattern_string (str): The URL pattern string to be parsed.
615630
616631
Returns:
617-
tuple: A 2-tuple, (url_meta, pattern_string_pieces).
632+
tuple: A 2-tuple, (url_meta, pattern_strings).
618633
"""
619634
result = parse_url_pattern_string(url_pattern_string)
620635
return unpack(result, False)
@@ -736,8 +751,9 @@ def parse_pattern_unit_string(pattern_unit_string):
736751
n = 1
737752
rule = p_str[idx:idx + n]
738753
if rule not in RULE_SET:
739-
raise InvalidPatternException(
740-
"Invalid pattern unit: %s" % pattern_unit_string)
754+
raise InvalidPatternException("Invalid rule '%s'" % rule)
741755
rules.add(rule)
742756
idx += n
757+
if (num > 0 and len(rules) > num) or num == 0:
758+
raise InvalidPatternException('Insufficient number')
743759
return rules, num

0 commit comments

Comments
 (0)