| author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-08 08:11:50 +0000 |
|---|---|---|
| committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-08 08:11:50 +0000 |
| commit | 8978da3b39d7ca3cf83ee30fcc63ffe0e5453fb2 (patch) | |
| tree | 2e29f131dff77b31e84c957266de8f18655b6f88 /sqlglotrs/src/tokenizer.rs | |
| parent | Adding upstream version 22.2.0. (diff) | |
| download | sqlglot-8978da3b39d7ca3cf83ee30fcc63ffe0e5453fb2.tar.xz sqlglot-8978da3b39d7ca3cf83ee30fcc63ffe0e5453fb2.zip | |
Adding upstream version 23.7.0. (upstream/23.7.0)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'sqlglotrs/src/tokenizer.rs')
-rw-r--r-- | sqlglotrs/src/tokenizer.rs | 85
1 file changed, 58 insertions, 27 deletions
diff --git a/sqlglotrs/src/tokenizer.rs b/sqlglotrs/src/tokenizer.rs
index 2c90a65..cdc61d7 100644
--- a/sqlglotrs/src/tokenizer.rs
+++ b/sqlglotrs/src/tokenizer.rs
@@ -118,8 +118,27 @@ impl<'a> TokenizerState<'a> {
     fn scan(&mut self, until_peek_char: Option<char>) -> Result<(), TokenizerError> {
         while self.size > 0 && !self.is_end {
-            self.start = self.current;
-            self.advance(1)?;
+            let mut current = self.current;
+
+            // Skip spaces here rather than iteratively calling advance() for performance reasons
+            while current < self.size {
+                let ch = self.char_at(current)?;
+
+                if ch == ' ' || ch == '\t' {
+                    current += 1;
+                } else {
+                    break;
+                }
+            }
+
+            let offset = if current > self.current {
+                current - self.current
+            } else {
+                1
+            };
+
+            self.start = current;
+            self.advance(offset as isize)?;
 
             if self.current_char == '\0' {
                 break;
             }
@@ -153,16 +172,12 @@ impl<'a> TokenizerState<'a> {
     }
 
     fn advance(&mut self, i: isize) -> Result<(), TokenizerError> {
-        let mut i = i;
         if Some(&self.token_types.break_) == self.settings.white_space.get(&self.current_char) {
             // Ensures we don't count an extra line if we get a \r\n line break sequence.
-            if self.current_char == '\r' && self.peek_char == '\n' {
-                i = 2;
-                self.start += 1;
+            if !(self.current_char == '\r' && self.peek_char == '\n') {
+                self.column = 1;
+                self.line += 1;
             }
-
-            self.column = 1;
-            self.line += 1;
         } else {
             self.column = self.column.wrapping_add_signed(i);
         }
@@ -404,11 +419,19 @@ impl<'a> TokenizerState<'a> {
                 };
 
                 self.advance(1)?;
+
                 let tag = if self.current_char.to_string() == *end {
                     String::from("")
                 } else {
-                    self.extract_string(end, false)?
+                    self.extract_string(end, false, false, !self.settings.heredoc_tag_is_identifier)?
                 };
+
+                if self.is_end && !tag.is_empty() && self.settings.heredoc_tag_is_identifier {
+                    self.advance(-(tag.len() as isize))?;
+                    self.add(self.token_types.heredoc_string_alternative, None)?;
+                    return Ok(true)
+                }
+
                 (None, *token_type, format!("{}{}{}", start, tag, end))
             } else {
                 (None, *token_type, end.clone())
@@ -418,7 +441,7 @@ impl<'a> TokenizerState<'a> {
         };
 
         self.advance(start.len() as isize)?;
-        let text = self.extract_string(&end, false)?;
+        let text = self.extract_string(&end, false, token_type != self.token_types.raw_string, true)?;
 
         if let Some(b) = base {
             if u64::from_str_radix(&text, b).is_err() {
@@ -561,7 +584,7 @@ impl<'a> TokenizerState<'a> {
 
     fn scan_identifier(&mut self, identifier_end: &str) -> Result<(), TokenizerError> {
         self.advance(1)?;
-        let text = self.extract_string(identifier_end, true)?;
+        let text = self.extract_string(identifier_end, true, true, true)?;
         self.add(self.token_types.identifier, Some(text))
     }
 
@@ -569,6 +592,8 @@ impl<'a> TokenizerState<'a> {
         &mut self,
         delimiter: &str,
         use_identifier_escapes: bool,
+        unescape_sequences: bool,
+        raise_unmatched: bool,
     ) -> Result<String, TokenizerError> {
         let mut text = String::from("");
 
@@ -578,8 +603,23 @@ impl<'a> TokenizerState<'a> {
             } else {
                 &self.settings.string_escapes
             };
-
             let peek_char_str = self.peek_char.to_string();
+
+            if unescape_sequences
+                && !self.dialect_settings.unescaped_sequences.is_empty()
+                && !self.peek_char.is_whitespace()
+                && self.settings.string_escapes.contains(&self.current_char)
+            {
+                let sequence_key = format!("{}{}", self.current_char, self.peek_char);
+                if let Some(unescaped_sequence) =
+                    self.dialect_settings.unescaped_sequences.get(&sequence_key)
+                {
+                    self.advance(2)?;
+                    text.push_str(unescaped_sequence);
+                    continue;
+                }
+            }
+
             if escapes.contains(&self.current_char)
                 && (peek_char_str == delimiter || escapes.contains(&self.peek_char))
                 && (self.current_char == self.peek_char
@@ -610,26 +650,17 @@ impl<'a> TokenizerState<'a> {
                     break;
                 }
                 if self.is_end {
+                    if !raise_unmatched {
+                        text.push(self.current_char);
+                        return Ok(text)
+                    }
+
                     return self.error_result(format!(
                         "Missing {} from {}:{}",
                         delimiter, self.line, self.current
                     ));
                 }
 
-                if !self.dialect_settings.escape_sequences.is_empty()
-                    && !self.peek_char.is_whitespace()
-                    && self.settings.string_escapes.contains(&self.current_char)
-                {
-                    let sequence_key = format!("{}{}", self.current_char, self.peek_char);
-                    if let Some(escaped_sequence) =
-                        self.dialect_settings.escape_sequences.get(&sequence_key)
-                    {
-                        self.advance(2)?;
-                        text.push_str(escaped_sequence);
-                        continue;
-                    }
-                }
-
                 let current = self.current - 1;
                 self.advance(1)?;
                 text.push_str(
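For readers skimming the first hunk: `scan()` now measures a run of spaces and tabs up front and advances by its length in one call instead of calling `advance()` once per character. The standalone sketch below models that fast path over a plain `Vec<char>`; the function name `skip_ws_offset` and the buffer setup are illustrative only, not sqlglot's API.

```rust
// Minimal sketch of the whitespace fast path added to scan():
// measure the run of spaces/tabs, then advance by its length (or 1).
fn skip_ws_offset(sql: &[char], current: usize) -> usize {
    let mut pos = current;

    // Walk past consecutive spaces and tabs without emitting tokens.
    while pos < sql.len() && (sql[pos] == ' ' || sql[pos] == '\t') {
        pos += 1;
    }

    // Mirror the diff: advance by the run length, or by 1 if there was no run.
    if pos > current {
        pos - current
    } else {
        1
    }
}

fn main() {
    let sql: Vec<char> = "SELECT    \t 1".chars().collect();
    // At index 6 (just after "SELECT") the run of blanks is 6 characters long.
    assert_eq!(skip_ws_offset(&sql, 6), 6);
    // On a non-blank character the offset falls back to 1.
    assert_eq!(skip_ws_offset(&sql, 0), 1);
    println!("ok");
}
```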
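The `advance()` hunk inverts the `\r\n` special case: the line counter is bumped on a line break unless the current character is the `'\r'` of a CRLF pair, so the pair counts as a single new line. A simplified standalone model of that bookkeeping, assuming a flat character buffer rather than the tokenizer's `current_char`/`peek_char` state:

```rust
// Sketch of the CRLF-aware line counting: skip the '\r' of a "\r\n" pair
// so the sequence increments the line number exactly once.
fn count_lines(text: &str) -> usize {
    let chars: Vec<char> = text.chars().collect();
    let mut line = 1;

    for i in 0..chars.len() {
        let is_break = chars[i] == '\n' || chars[i] == '\r';
        let is_crlf_start = chars[i] == '\r' && chars.get(i + 1) == Some(&'\n');

        // Mirror the diff: only count when this is not the '\r' of a CRLF pair.
        if is_break && !is_crlf_start {
            line += 1;
        }
    }
    line
}

fn main() {
    // "a", "b", "c" span three lines whether the break is "\r\n" or "\n".
    assert_eq!(count_lines("a\r\nb\nc"), 3);
    println!("ok");
}
```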
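The later hunks thread two new flags through `extract_string()`: `raise_unmatched`, which lets an unterminated string be returned instead of raised, and `unescape_sequences`, which looks up the two-character window `current_char` + `peek_char` in a dialect-provided map and substitutes the literal character. The sketch below shows only that lookup pattern in isolation; the map contents and the `unescape` helper are illustrative assumptions, not the exact tables sqlglot ships.

```rust
use std::collections::HashMap;

// Sketch of the unescaped_sequences lookup: a two-character key such as "\\n"
// is replaced by its mapped literal, consuming both characters (advance(2)).
fn unescape(text: &str, sequences: &HashMap<String, String>) -> String {
    let chars: Vec<char> = text.chars().collect();
    let mut out = String::new();
    let mut i = 0;

    while i < chars.len() {
        if i + 1 < chars.len() {
            let key: String = [chars[i], chars[i + 1]].iter().collect();
            if let Some(replacement) = sequences.get(&key) {
                out.push_str(replacement);
                i += 2; // consume the whole escape sequence
                continue;
            }
        }
        out.push(chars[i]);
        i += 1;
    }
    out
}

fn main() {
    let mut sequences = HashMap::new();
    sequences.insert("\\n".to_string(), "\n".to_string());
    sequences.insert("\\t".to_string(), "\t".to_string());

    assert_eq!(unescape("a\\nb", &sequences), "a\nb");
    println!("ok");
}
```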