diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..13566b8
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/.idea/SelectorLib.iml b/.idea/SelectorLib.iml
new file mode 100644
index 0000000..5195124
--- /dev/null
+++ b/.idea/SelectorLib.iml
@@ -0,0 +1,17 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/discord.xml b/.idea/discord.xml
new file mode 100644
index 0000000..d8e9561
--- /dev/null
+++ b/.idea/discord.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000..e9c75c4
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,201 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..d0f0c37
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..a11b2cc
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/selectorlib/__init__.py b/selectorlib/__init__.py
index 4b8c93b..2364f3e 100644
--- a/selectorlib/__init__.py
+++ b/selectorlib/__init__.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-"""Top-level package for selectorlib."""
+"""Top-level package for Selectorlib."""
__author__ = """scrapehero"""
__email__ = 'pypi@scrapehero.com'
diff --git a/selectorlib/cli.py b/selectorlib/cli.py
index 86082cc..7495957 100644
--- a/selectorlib/cli.py
+++ b/selectorlib/cli.py
@@ -1,15 +1,15 @@
# -*- coding: utf-8 -*-
-"""Console script for selectorlib."""
+"""Console script for Selectorlib."""
import sys
import click
@click.command()
def main(args=None):
- """Console script for selectorlib."""
+ """Console script for Selectorlib."""
click.echo("Replace this message by putting your code into "
- "selectorlib.cli.main")
+ "selectorlib.cli.main")
click.echo("See click documentation at http://click.pocoo.org/")
return 0
diff --git a/selectorlib/selectorlib.py b/selectorlib/selectorlib.py
index 7e59f18..c4d21a2 100644
--- a/selectorlib/selectorlib.py
+++ b/selectorlib/selectorlib.py
@@ -1,9 +1,9 @@
# -*- coding: utf-8 -*-
+import re
import parsel
import yaml
import inspect
-
def extract_field(element, item_type, attribute=None, formatter=None):
if item_type == 'Text':
texts = [i.strip() for i in element.xpath('.//text()').getall() if i.strip()]
@@ -35,7 +35,7 @@ def __init__(self, config, formatters=None):
def from_yaml_string(cls, yaml_string: str, formatters=None):
"""create `Extractor` object from yaml string
- >>> yaml_string = '''
+ >>> yaml_string = '''
title:
css: "h1"
type: Text
@@ -77,13 +77,19 @@ def extract(self, html: str, base_url: str = None):
def _extract_selector(self, field_config, parent_parser):
if field_config.get("xpath") is not None:
elements = parent_parser.xpath(field_config['xpath'])
+ if len(elements) == 0:  # primary XPath matched nothing
+ if field_config.get("xpath_alias") is not None:
+ elements = parent_parser.xpath(field_config['xpath_alias'])  # fall back to the alias XPath; read the same key that was just checked
+
else:
css = field_config['css']
if css == '':
elements = [parent_parser]
else:
elements = parent_parser.css(field_config['css'])
+
item_type = field_config.get('type', 'Text')
+ # print(field_config) # Returns all fields
if not elements:
return None
values = []
@@ -100,6 +106,11 @@ def _extract_selector(self, field_config, parent_parser):
value = extract_field(element, item_type, **kwargs)
if field_config.get('multiple') is not True:
+ if 're' in field_config:
+ pattern = re.compile(f'{field_config.get("re")}')
+ regex = re.sub(pattern, '', value)
+ return regex
+
return value
else:
values.append(value)