diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..13566b8
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/.idea/SelectorLib.iml b/.idea/SelectorLib.iml
new file mode 100644
index 0000000..5195124
--- /dev/null
+++ b/.idea/SelectorLib.iml
@@ -0,0 +1,17 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/discord.xml b/.idea/discord.xml
new file mode 100644
index 0000000..d8e9561
--- /dev/null
+++ b/.idea/discord.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000..e9c75c4
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,201 @@
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..d0f0c37
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..a11b2cc
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/selectorlib/__init__.py b/selectorlib/__init__.py
index 4b8c93b..2364f3e 100644
--- a/selectorlib/__init__.py
+++ b/selectorlib/__init__.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-"""Top-level package for selectorlib."""
+"""Top-level package for Selectorlib."""
 
 __author__ = """scrapehero"""
 __email__ = 'pypi@scrapehero.com'
diff --git a/selectorlib/cli.py b/selectorlib/cli.py
index 86082cc..7495957 100644
--- a/selectorlib/cli.py
+++ b/selectorlib/cli.py
@@ -1,15 +1,15 @@
 # -*- coding: utf-8 -*-
 
-"""Console script for selectorlib."""
+"""Console script for Selectorlib."""
 import sys
 import click
 
 
 @click.command()
 def main(args=None):
-    """Console script for selectorlib."""
+    """Console script for Selectorlib."""
     click.echo("Replace this message by putting your code into "
                "selectorlib.cli.main")
     click.echo("See click documentation at http://click.pocoo.org/")
     return 0
 
diff --git a/selectorlib/selectorlib.py b/selectorlib/selectorlib.py
index 7e59f18..c4d21a2 100644
--- a/selectorlib/selectorlib.py
+++ b/selectorlib/selectorlib.py
@@ -1,9 +1,10 @@
 # -*- coding: utf-8 -*-
+import re
 import parsel
 import yaml
 import inspect
 
 
 def extract_field(element, item_type, attribute=None, formatter=None):
     if item_type == 'Text':
         texts = [i.strip() for i in element.xpath('.//text()').getall() if i.strip()]
@@ -77,13 +78,19 @@ def extract(self, html: str, base_url: str = None):
     def _extract_selector(self, field_config, parent_parser):
         if field_config.get("xpath") is not None:
             elements = parent_parser.xpath(field_config['xpath'])
+            if not elements and field_config.get("xpath_alias") is not None:
+                # Primary XPath matched nothing: retry with the configured alias.
+                elements = parent_parser.xpath(field_config['xpath_alias'])
+
         else:
             css = field_config['css']
             if css == '':
                 elements = [parent_parser]
             else:
                 elements = parent_parser.css(field_config['css'])
+
         item_type = field_config.get('type', 'Text')
+        # Nothing matched the selector(s): report the field as missing.
         if not elements:
             return None
         values = []
@@ -100,6 +107,11 @@ def _extract_selector(self, field_config, parent_parser):
                 value = extract_field(element, item_type, **kwargs)
 
             if field_config.get('multiple') is not True:
+                pattern = field_config.get('re')
+                if pattern is not None and isinstance(value, str):
+                    # re.sub only accepts text; other values pass through as-is.
+                    return re.sub(str(pattern), '', value)
+
                 return value
             else:
                 values.append(value)