From 9b029a8b6ae40cb8cae2a506d2380a4077afb51b Mon Sep 17 00:00:00 2001
From: Kabir Maniar
Date: Thu, 23 Sep 2021 09:34:04 +0530
Subject: [PATCH 1/4] Added tests for numbers in the lexer changed the
 print_tokens to return_token

---
 internet_object/parsers/lexer_tests.py |  30 +-
 internet_object/parsers/lexers.py      | 418 +++++++++++++------------
 2 files changed, 225 insertions(+), 223 deletions(-)

diff --git a/internet_object/parsers/lexer_tests.py b/internet_object/parsers/lexer_tests.py
index a9c1135..bf8bc3f 100644
--- a/internet_object/parsers/lexer_tests.py
+++ b/internet_object/parsers/lexer_tests.py
@@ -1,23 +1,23 @@
+import math
+from re import split
 import unittest

 from lexers import Lexer


 class LexerTest(unittest.TestCase):
+    print("")

-  def test(self):
-
-    lexer = Lexer(r"""
-    ~ "man\"iar
-    ",
-    ~ 10, T, F, 20, test: abc # This is a comment
-    --- ab, b, N, testing,
-    ~ { aamir: maniar , -200.50 , 'adsfasdf' }
-    ~ "test " "Wow"
-    """)
-    lexer.read_all()
-    lexer.print_tokens()
-    self.assertTrue(True)
-
+    def test_number(self):
+        tests = {"213": 213, "37.697": 37.697, "-68": -68, "-348.978": -348.978, "1.10e+20": 1.10e+20, "0xff":
+                 hex(255), "0o77": oct(63), str(math.pi): math.pi, f"{str(oct(34))}": oct(34), f"{str(bin(52346))}": bin(52346),
+                 "Inf": "Inf", "NaN": "NaN"}
+        for i in tests:
+            lex = Lexer(i+" ")
+            lex.read_all()
+            token = lex.return_tokens()
+            tokenVal = token.__dict__["val"]
+            self.assertTrue(
+                tokenVal == tests[i], f"Expected : {tests[i]}, Got : {tokenVal}")

 if __name__ == '__main__':
-  unittest.main()
+    unittest.main()
diff --git a/internet_object/parsers/lexers.py b/internet_object/parsers/lexers.py
index 9dd9506..5b34d86 100644
--- a/internet_object/parsers/lexers.py
+++ b/internet_object/parsers/lexers.py
@@ -5,221 +5,223 @@ class Lexer():
-  """
-  Lexer helps parsing internet object text into tokens.
-  """
-
-  def __init__(self, text):
-    self._index = -1
-    self._text = text
-    self._done = False
-    self._col = 0
-    self._row = 0
-    self._tokens = []
-    self._len = len(text)
-    self.advance()
-
-  @property
-  def done(self):
-    return self._done
-
-  def read_all(self):
-    token = self.read()
-
-    while(self._done is not True):
-      if token:
-        self._tokens.append(token)
-      token = self.read()
-
-    if self._done and token:
-      self._tokens.append(token)
-
-  @property
-  def tokens(self):
-    return self._tokens
-
-  def print_tokens(self):
-    for token in self._tokens:
-      print('-', token)
-
-  def read(self):
-
-    if self._done is True:
-      return
-
-    ch = self._ch
-    ch_code = self._ch_code
-
-    # print("+++", repr(ch), self._index)
-    should_advance = False
-    token = None
-
-    # Validators
-
-    is_datasep = False
-    if ch == '-':
-      is_datasep = self.is_datasep
-
-    # Scanner and Processor
-    if ch_code <= 32:
-      self.scan('ws',
-                lambda a, b: self._ch_code <= 32,
-                True)
-
-    # Scan regular string
-    elif ch == '"':
-      token = self.scan("string", self.string_scanner, confined=True)
-      self.advance()
-
-    # Scan raw string
-    elif ch == "'":
-      token = self.scan("string", self.raw_string_scanner, confined=True)
-      self.advance()
-
-    elif ch == '#':
-      token = self.scan('comment', self.comment_scanner)
-
-    elif is_datasep:
-      token = Token("---", "datasep", self._index,
-                    self._index + 3, self._row, self._col)
-      self.advance(3)
-
-    # Process separator
-    elif re_separator.match(ch):
-      # self._index += 1
-      token = Token(ch, 'sep', self._index,
-                    self._index, self._row, self._col)
-      self.advance()
-
-    else:
-      # Scan everything else
-      token = self.scan("string", self.sep_scanner)
-      value, token_type = self.process_open_values(token.token)
-      token.val = value
-      token.type = token_type
-
-    return token
-
-  def ws_scanner(self, start, end):
-    return self._ch_code <= 32
-
-  def advance(self, times=1):
-    advanced = 1
-    try:
-      self._ch = self._text[self._index+1]
-      self._ch_code = ord(self._ch)
-
-      self._index += 1
-      self._col += 1
-
-      if self._ch == '\n':
-        self._col = 1
-        self._row += 1
-
-      result = True
-      while advanced < times:
-        result = self.advance()
-        advanced += 1
-
-      return result
-
-    except IndexError:  # End of the text
-      self._ch = None
-      self._ch_code = -1
-      self._done = True
-      self._index = len(self._text) - 1
-      return False
-
-  def scan(self, token_type, scanner, confined=False, skip=False):
-
-    start = self._index
-
-    while self.advance():
-
-      # Reached the end of the text, break it
-      if self._done is True:
-        break
-
-      if scanner(start, self._index) is False:
-        break
-
-    token = self._text[start:self._index + (1 if confined else 0)].strip()
-    return None if skip else (Token(token, token_type,
-                              start, start + len(token)-1, self._row, self._col))
-
-  # Validators, Scanners and Processors
-  @property
-  def is_datasep(self):
-    start = self._index
-    end = self._index + 3
-    try:
-      token = self._text[start:end]
-      next_ch = self._text[start+3]
-      return token == "---" and next_ch != "-"
-
-    except IndexError:
-      return False
-
-  def string_scanner(self, start, end):
-    if self._ch != '"':
-      if self._index == self._len - 1:
-        raise SyntaxError("incomplete-string (%s, %s)" %
-                          (self._row, self._col,))
-      return True
-
-    token = self._text[start:self._index+1]
-    return re_regular_string.match(token) is None
-
-  def raw_string_scanner(self, start, end):
-    if self._ch != "'":
-      if self._index == self._len - 1:
-        raise SyntaxError("incomplete-string (%s, %s)" %
-                          (self._row, self._col,))
-      return True
-
-    # If next ch is ' too, ignore it
-    try:
-      next_ch = self._text[self._index+1]
-      if next_ch == "'": return True
-    except IndexError:
-      return False
-
-    token = self._text[start:self._index+1]
-    return re_raw_string.match(token) is None
-
-  def sep_scanner(self, start, end):
-    if re_separator.match(self._ch) is not None:
-      return False
-
-    elif self._ch == '#':
-      return False
-
-    if self._ch == "-":
-      return not self.is_datasep
-
-    return True
-
-  def comment_scanner(self, start, end):
-    return self._ch != '\n'
-
-  def process_open_values(self, token):
-
-    if token == 'T' or token == 'true':
-      return True, 'bool'
-
-    elif token == 'F' or token == 'false':
-      return False, 'bool'
-
-    elif token == 'N' or token == 'null':
-      return None, 'null'
-
-    elif re_number.match(token) is not None:
-      try:
-        return (
-          int(token) if re.search(r"[\.eE]", token) is None else float(token)
-        ), 'number'
-      except ValueError:
-        pass
-
-    return token, "string"
+    """
+    Lexer helps parsing internet object text into tokens.
+ """ + + def __init__(self, text): + self._index = -1 + self._text = text + self._done = False + self._col = 0 + self._row = 0 + self._tokens = [] + self._len = len(text) + self.advance() + + @property + def done(self): + return self._done + + def read_all(self): + token = self.read() + + while(self._done is not True): + if token: + self._tokens.append(token) + token = self.read() + + if self._done and token: + self._tokens.append(token) + + @property + def tokens(self): + return self._tokens + + def return_tokens(self): + for token in self._tokens: + # print('-', token) + return token + + def read(self): + + if self._done is True: + return - if self._done is True: - return + ch = self._ch + ch_code = self._ch_code + + # print("+++", repr(ch), self._index) + should_advance = False + token = None + + # Validators + + is_datasep = False + if ch == '-': + is_datasep = self.is_datasep + + # Scanner and Processor + if ch_code <= 32: + self.scan('ws', + lambda a, b: self._ch_code <= 32, + True) + + # Scan regular string + elif ch == '"': + token = self.scan("string", self.string_scanner, confined=True) + self.advance() + + # Scan raw string + elif ch == "'": + token = self.scan("string", self.raw_string_scanner, confined=True) + self.advance() + + elif ch == '#': + token = self.scan('comment', self.comment_scanner) + + elif is_datasep: + token = Token("---", "datasep", self._index, + self._index + 3, self._row, self._col) + self.advance(3) + + # Process separator + elif re_separator.match(ch): + # self._index += 1 + token = Token(ch, 'sep', self._index, + self._index, self._row, self._col) + self.advance() + + else: + # Scan everything else + token = self.scan("string", self.sep_scanner) + value, token_type = self.process_open_values(token.token) + token.val = value + token.type = token_type + + return token + + def ws_scanner(self, start, end): + return self._ch_code <= 32 + + def advance(self, times=1): + advanced = 1 + try: + self._ch = self._text[self._index+1] + self._ch_code = ord(self._ch) + + self._index += 1 + self._col += 1 + + if self._ch == '\n': + self._col = 1 + self._row += 1 + + result = True + while advanced < times: + result = self.advance() + advanced += 1 + + return result + + except IndexError: # End of the text + self._ch = None + self._ch_code = -1 + self._done = True + self._index = len(self._text) - 1 + return False + + def scan(self, token_type, scanner, confined=False, skip=False): + + start = self._index + + while self.advance(): + + # Reached the end of the text, break it + if self._done is True: + break + + if scanner(start, self._index) is False: + break + + token = self._text[start:self._index + (1 if confined else 0)].strip() + return None if skip else (Token(token, token_type, + start, start + len(token)-1, self._row, self._col)) + + # Validators, Scanners and Processors + @property + def is_datasep(self): + start = self._index + end = self._index + 3 + try: + token = self._text[start:end] + next_ch = self._text[start+3] + return token == "---" and next_ch != "-" + + except IndexError: + return False + + def string_scanner(self, start, end): + if self._ch != '"': + if self._index == self._len - 1: + raise SyntaxError("incomplete-string (%s, %s)" % + (self._row, self._col,)) + return True + + token = self._text[start:self._index+1] + return re_regular_string.match(token) is None + + def raw_string_scanner(self, start, end): + if self._ch != "'": + if self._index == self._len - 1: + raise SyntaxError("incomplete-string (%s, %s)" % + (self._row, self._col,)) + 
+            return True
+
+        # If next ch is ' too, ignore it
+        try:
+            next_ch = self._text[self._index+1]
+            if next_ch == "'":
+                return True
+        except IndexError:
+            return False
+
+        token = self._text[start:self._index+1]
+        return re_raw_string.match(token) is None
+
+    def sep_scanner(self, start, end):
+        if re_separator.match(self._ch) is not None:
+            return False
+
+        elif self._ch == '#':
+            return False
+
+        if self._ch == "-":
+            return not self.is_datasep
+
+        return True
+
+    def comment_scanner(self, start, end):
+        return self._ch != '\n'
+
+    def process_open_values(self, token):
+
+        if token == 'T' or token == 'true':
+            return True, 'bool'
+
+        elif token == 'F' or token == 'false':
+            return False, 'bool'
+
+        elif token == 'N' or token == 'null':
+            return None, 'null'
+
+        elif re_number.match(token) is not None:
+            try:
+                return (
+                    int(token) if re.search(
+                        r"[\.eE]", token) is None else float(token)
+                ), 'number'
+            except ValueError:
+                pass
+
+        return token, "string"
From db0408bb4b3b01b56fb421ec721ba948d3fba4a6 Mon Sep 17 00:00:00 2001
From: Kabir Maniar
Date: Thu, 23 Sep 2021 09:37:01 +0530
Subject: [PATCH 2/4] Added lexer tests for strings

---
 internet_object/parsers/lexer_tests.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/internet_object/parsers/lexer_tests.py b/internet_object/parsers/lexer_tests.py
index bf8bc3f..b881538 100644
--- a/internet_object/parsers/lexer_tests.py
+++ b/internet_object/parsers/lexer_tests.py
@@ -19,5 +19,16 @@ def test_number(self):
             self.assertTrue(
                 tokenVal == tests[i], f"Expected : {tests[i]}, Got : {tokenVal}")

+    def test_strings(self):
+        tests = {'"hello"': '"hello"', 'World': "World", "'Hello \n'": "'Hello \n'", '"Test \n Test2"': '"Test \n Test2"', "'Tab\ttest'": """'Tab\ttest'""",
+                 "'World'": "'World'", "'as\"df'": "'as\"df'", "''": "''", '""': '""', "test": "test"}
+        for i in tests:
+            lex = Lexer(i+" ")
+            lex.read_all()
+            token = lex.return_tokens()
+            tokenVal = token.__dict__["val"]
+            self.assertTrue(
+                tokenVal == tests[i], f"Expected : {tests[i]}, Got : {tokenVal}")
+
 if __name__ == '__main__':
     unittest.main()

From ef531cd0ea7ecb9b8751dd5c9de032f443d8f943 Mon Sep 17 00:00:00 2001
From: Kabir Maniar
Date: Thu, 23 Sep 2021 09:37:23 +0530
Subject: [PATCH 3/4] Added lexer tests for boolean

---
 internet_object/parsers/lexer_tests.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/internet_object/parsers/lexer_tests.py b/internet_object/parsers/lexer_tests.py
index b881538..11de401 100644
--- a/internet_object/parsers/lexer_tests.py
+++ b/internet_object/parsers/lexer_tests.py
@@ -30,5 +30,14 @@ def test_strings(self):
             tokenVal = token.__dict__["val"]
             self.assertTrue(
                 tokenVal == tests[i], f"Expected : {tests[i]}, Got : {tokenVal}")
+
+    def test_boolean(self):
+        tests = {"T": True, "F": False, "false": False, "true": True}
+        for i in tests:
+            lex = Lexer(i+" ")
+            lex.read_all()
+            token = lex.return_tokens()
+            tokenVal = token.__dict__["val"]
+            self.assertTrue(
+                tokenVal == tests[i], f"Expected : {tests[i]}, Got : {tokenVal}")
 if __name__ == '__main__':
     unittest.main()

From 23823bd1a12a8fd5b0583a0f44029fc4387ac4d6 Mon Sep 17 00:00:00 2001
From: Kabir Maniar
Date: Thu, 23 Sep 2021 09:37:41 +0530
Subject: [PATCH 4/4] Added lexer tests for null

---
 internet_object/parsers/lexer_tests.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/internet_object/parsers/lexer_tests.py b/internet_object/parsers/lexer_tests.py
index 11de401..e85f436 100644
--- a/internet_object/parsers/lexer_tests.py
+++ b/internet_object/parsers/lexer_tests.py
@@ -39,5 +39,15 @@ def test_boolean(self):
             tokenVal = token.__dict__["val"]
             self.assertTrue(
                 tokenVal == tests[i], f"Expected : {tests[i]}, Got : {tokenVal}")
+
+    def test_null(self):
+        tests = {"null": None,"N":None}
+        for i in tests:
+            lex = Lexer(i+" ")
+            lex.read_all()
+            token = lex.return_tokens()
+            tokenVal = token.__dict__["val"]
+            self.assertTrue(
+                tokenVal == tests[i], f"Expected : {tests[i]}, Got : {tokenVal}")
 if __name__ == '__main__':
     unittest.main()
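
A minimal sketch of what these tests exercise, assuming lexers.py is importable exactly as lexer_tests.py imports it above; the val and type fields are set on the Token objects inside read(), and the Token class itself is outside this diff:

    # Hypothetical usage sketch, not part of the patches above.
    from lexers import Lexer

    lex = Lexer("-348.978 ")       # trailing space mirrors Lexer(i+" ") in the tests
    lex.read_all()                 # tokenize the whole input
    token = lex.return_tokens()    # helper added in PATCH 1/4; returns the first collected token
    print(token.val, token.type)   # test_number expects val == -348.978, type 'number'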