diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs index 0b41f6b..79b0847 100644 --- a/src/lexer/mod.rs +++ b/src/lexer/mod.rs @@ -1,6 +1,6 @@ -extern crate unicode_normalization; /// Unicode lexer for the HER language. /// Some functions taken from `rust/compiler/rustc_lexer/src/lib.rs`. +extern crate unicode_normalization; extern crate unicode_xid; use crate::token::Token; @@ -20,79 +20,39 @@ pub fn nfc_normalize(string: &str) -> String { /// True if `c` is considered a whitespace according to HER. Does not include \n. pub fn is_whitespace(c: char) -> bool { - // This is Pattern_White_Space minus \n. - // - // Note that this set is stable (ie, it doesn't change with different - // Unicode versions), so it's ok to just hard-code the values. - matches!( c, - // Usual ASCII suspects - '\u{0009}' // \t - | '\u{000C}' // form feed - | '\u{000D}' // \r - | '\u{000B}' // vertical tab - | '\u{0020}' // space - - // NEXT LINE from latin1 - | '\u{0085}' - - // Bidi markers - | '\u{200E}' // LEFT-TO-RIGHT MARK - | '\u{200F}' // RIGHT-TO-LEFT MARK - - // Dedicated whitespace characters from Unicode - | '\u{2028}' // LINE SEPARATOR - | '\u{2029}' // PARAGRAPH SEPARATOR + '\u{0009}' | '\u{000C}' | '\u{000D}' | '\u{000B}' | '\u{0020}' | '\u{0085}' + | '\u{200E}' | '\u{200F}' | '\u{2028}' | '\u{2029}' ) } /// Decide whether character may show up in emoji. -/// We cannot validate the entire sequence given the current architecture. fn is_emoji_like(c: char) -> bool { if c < '\x7f' { false } else { - // ZWJ - c == '\u{200D}' - // VS15, 16 - || c == '\u{fe0f}' || c == '\u{fe0e}' - // Big SMP chunk (includes modifiers and by accident chess) - || ('\u{1f000}'..='\u{1faff}').contains(&c) - // The BMP parts that follow are actually quite questionable - || c == '\u{2139}' - // (unstable!) Arrows, not sure if we will repocess them for operators! - || ('\u{2190}'..='\u{21FF}').contains(&c) - || ('\u{2300}'..='\u{23FF}').contains(&c) - || ('\u{25A0}'..='\u{25FF}').contains(&c) - || ('\u{2600}'..='\u{26FF}').contains(&c) - // (unstable!) Dingbats, some are unfortunately punctuations - || ('\u{2700}'..='\u{27FF}').contains(&c) - // Too lazy to do 2800-329f, will come back later + c == '\u{200D}' || c == '\u{fe0f}' || c == '\u{fe0e}' + || ('\u{1f000}'..='\u{1faff}').contains(&c) + || c == '\u{2139}' + || ('\u{2190}'..='\u{21FF}').contains(&c) + || ('\u{2300}'..='\u{23FF}').contains(&c) + || ('\u{25A0}'..='\u{25FF}').contains(&c) + || ('\u{2600}'..='\u{26FF}').contains(&c) + || ('\u{2700}'..='\u{27FF}').contains(&c) } } /// True if `c` is valid as a first character of an identifier. -/// Compared to Rust, we additionally allow $ and ¥. fn is_id_start(c: char) -> bool { - c.is_ascii_lowercase() - || c.is_ascii_uppercase() - || c == '_' - || c == '$' - || c == '¥' + c.is_ascii_alphabetic() || c == '_' || c == '$' || c == '¥' || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_start(c)) || is_emoji_like(c) } /// True if `c` is valid as a non-first character of an identifier. -/// Compared to Rust, we additionally allow $ and ¥. fn is_id_continue(c: char) -> bool { - c.is_ascii_lowercase() - || c.is_ascii_uppercase() - || c.is_ascii_digit() - || c == '_' - || c == '$' - || c == '¥' + c.is_ascii_alphanumeric() || c == '_' || c == '$' || c == '¥' || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_continue(c)) || is_emoji_like(c) } @@ -145,6 +105,8 @@ impl Lexer { loop { if is_whitespace(self.ch) { self.read_char(); + } else if self.ch == '\n' { + self.read_char(); } else { break; } @@ -154,6 +116,47 @@ impl Lexer { pub fn next_token(&mut self) -> Token { self.skip_whitespace(); + if self.ch == '其' { + let mut comment = String::new(); + comment.push(self.ch); + self.read_char(); + + if self.ch == '其' { + self.read_char(); + if self.ch == '其' { + self.read_char(); + if self.ch == '其' { + self.read_char(); + if self.ch == '其' { + self.read_char(); + if self.ch == '其' { + self.read_char(); + if self.ch == '其' { + self.read_char(); + if self.ch == '其' { + self.read_char(); + if self.ch == '其' { + self.read_char(); + if self.ch == '其' { + self.read_char(); + } + } + } + } + } + } + } + } + } + + while self.ch != '\n' && self.ch != '\0' { + comment.push(self.ch); + self.read_char(); + } + + return Token::Comment(comment); + } + let tok = match self.ch { '=' => { if self.nextch_is('=') { @@ -207,14 +210,7 @@ impl Lexer { '"' => { return self.consume_string(); } - '\n' => { - if self.nextch_is('\n') { - Token::Blank - } else { - self.read_char(); - return self.next_token(); - } - } + '\n' => Token::Blank, '\0' => Token::Eof, _ => { if is_id_start(self.ch) { @@ -272,6 +268,7 @@ impl Lexer { "差异" => Token::Minus, "种草" => Token::Asterisk, "踩雷" => Token::Slash, + "" => Token::Ident(nfc_normalize(&literal)), _ => Token::Ident(nfc_normalize(&literal)), } } @@ -319,7 +316,6 @@ impl Lexer { } self.read_char(); } - // FIXME: Make Lexer faliable Token::String("".to_string()) } } @@ -357,8 +353,8 @@ if (5 < 10) { [1, 2]; - {"foo": "bar"}; +其实我觉得 这是一个注释 "#; let tests = vec![ @@ -466,6 +462,7 @@ if (5 < 10) { Token::String(String::from("bar")), Token::Rbrace, Token::Semicolon, + Token::Comment(String::from("其实我觉得 这是一个注释")), Token::Eof, ]; @@ -494,6 +491,7 @@ if (5 < 10) { }; fib(10); +其实我觉得 这是一个注释 "#; let tests = vec![ @@ -505,6 +503,8 @@ fib(10); Token::Ident(String::from("n")), Token::Rparen, Token::Lbrace, + Token::Comment(String::from("其实我觉得 这是一个注释")), + Token::Eof, ]; let mut lexer = Lexer::new(input);