diff --git a/vlib/strings/textscanner/textscanner.v b/vlib/strings/textscanner/textscanner.v index 97091466ba94b4..7ed693d7119926 100644 --- a/vlib/strings/textscanner/textscanner.v +++ b/vlib/strings/textscanner/textscanner.v @@ -3,19 +3,37 @@ module textscanner // TextScanner simplifies writing small scanners/parsers // by providing safe methods to scan texts character by // character, peek for the next characters, go back, etc. +// TODO: maybe a generic type is more suitable for this. pub struct TextScanner { pub: - input string - ilen int + input_runes []rune // the input decoded into runes; filled by new() only when force_rune_mode is set + input string // the input as a byte string; filled by new() only when force_rune_mode is not set + ilen int // input length: input_runes.len in rune mode, input.len otherwise pub mut: - pos int // current position; pos is *always* kept in [0,ilen] + pos int // current position; pos is *always* kept in [0,ilen] + config TextScannerConfig // the options that were passed to new() +} + +@[params] +pub struct TextScannerConfig { +pub mut: + force_rune_mode bool // when true, new() decodes the input with .runes() and the scanner indexes runes instead of bytes } // new returns a stack allocated instance of TextScanner. -pub fn new(input string) TextScanner { +pub fn new(input string, config TextScannerConfig) TextScanner { + if config.force_rune_mode { + input_runes := input.runes() + return TextScanner{ + input_runes: input_runes + ilen: input_runes.len + config: config + } + } return TextScanner{ - input: input - ilen: input.len + input: input.clone() // NOTE(review): presumably cloned so that free() never releases the caller's string - confirm + ilen: input.len + config: config } } @@ -23,7 +41,11 @@ pub fn new(input string) TextScanner { @[unsafe] pub fn (mut ss TextScanner) free() { unsafe { - ss.input.free() + if ss.config.force_rune_mode { + ss.input_runes.free() + } else { + ss.input.free() + } } } @@ -41,7 +63,11 @@ pub fn (mut ss TextScanner) next() int { if ss.pos < ss.ilen { opos := ss.pos ss.pos++ - return ss.input[opos] + if ss.config.force_rune_mode { + return int(ss.input_runes[opos]) + } else { + return ss.input[opos] + } } return -1 } @@ -59,7 +85,9 @@ pub fn (mut ss TextScanner) skip() { @[inline] pub fn (mut ss TextScanner) skip_n(n int) { ss.pos += n - if ss.pos > ss.ilen { + if ss.pos < 0 { + ss.pos = 0 + } else if ss.pos > ss.ilen { ss.pos = ss.ilen } } @@ -70,7 
+98,11 @@ pub fn (mut ss TextScanner) skip_n(n int) { @[direct_array_access; inline] pub fn (ss &TextScanner) peek() int { if ss.pos < ss.ilen { - return ss.input[ss.pos] + if ss.config.force_rune_mode { + return int(ss.input_runes[ss.pos]) + } else { + return ss.input[ss.pos] + } } return -1 } @@ -83,7 +115,11 @@ pub fn (ss &TextScanner) peek() int { @[direct_array_access; inline] pub fn (ss &TextScanner) peek_u8() u8 { if ss.pos < ss.ilen { - return ss.input[ss.pos] + if ss.config.force_rune_mode { + return u8(ss.input_runes[ss.pos]) // NOTE: the rune is cast to u8, so code points above 255 are not preserved + } else { + return ss.input[ss.pos] + } } return 0 } @@ -94,10 +130,15 @@ pub fn (ss &TextScanner) peek_u8() u8 { // ts.peek_n(1) == ts.peek() . @[direct_array_access; inline] pub fn (ss &TextScanner) peek_n(n int) int { - if ss.pos + n < ss.ilen { + new_pos := ss.pos + n + if new_pos < 0 || new_pos >= ss.ilen { // also rejects a negative `n` that would land before the start + return -1 + } + if ss.config.force_rune_mode { + return int(ss.input_runes[ss.pos + n]) + } else { return ss.input[ss.pos + n] } - return -1 } // peek_n_u8 returns the character code from the input text, at position + `n`, @@ -107,10 +148,15 @@ pub fn (ss &TextScanner) peek_n(n int) int { // legitimately contain bytes with value `0`. @[direct_array_access; inline] pub fn (ss &TextScanner) peek_n_u8(n int) u8 { - if ss.pos + n < ss.ilen { + new_pos := ss.pos + n + if new_pos < 0 || new_pos >= ss.ilen { + return 0 + } + if ss.config.force_rune_mode { + return u8(ss.input_runes[ss.pos + n]) // NOTE: the rune is cast to u8, so code points above 255 are not preserved + } else { return ss.input[ss.pos + n] } - return 0 } // back goes back one character from the current scanner position. @@ -122,12 +168,12 @@ pub fn (mut ss TextScanner) back() { } // back_n goes back `n` characters from the current scanner position. 
+@[inline] pub fn (mut ss TextScanner) back_n(n int) { ss.pos -= n if ss.pos < 0 { ss.pos = 0 - } - if ss.pos > ss.ilen { + } else if ss.pos > ss.ilen { ss.pos = ss.ilen } } @@ -148,7 +194,11 @@ pub fn (ss &TextScanner) peek_back() int { pub fn (ss &TextScanner) peek_back_n(n int) int { offset := n + 1 if ss.pos >= offset { - return ss.input[ss.pos - offset] + if ss.config.force_rune_mode { + return int(ss.input_runes[ss.pos - offset]) + } else { + return ss.input[ss.pos - offset] + } } return -1 } @@ -159,7 +209,11 @@ pub fn (ss &TextScanner) peek_back_n(n int) int { @[direct_array_access; inline] pub fn (mut ss TextScanner) current() int { if ss.pos > 0 { - return ss.input[ss.pos - 1] + if ss.config.force_rune_mode { + return int(ss.input_runes[ss.pos - 1]) + } else { + return ss.input[ss.pos - 1] + } } return -1 } @@ -167,6 +221,7 @@ pub fn (mut ss TextScanner) current() int { // reset resets the internal state of the scanner // After calling .reset(), .next() will start reading // again from the start of the input text. +@[inline] pub fn (mut ss TextScanner) reset() { ss.pos = 0 } @@ -175,13 +230,147 @@ pub fn (mut ss TextScanner) reset() { // i.e. after calling .goto_end(), the scanner will be at // the end of the input text. Further .next() calls will // return -1, unless you go back. +@[inline] pub fn (mut ss TextScanner) goto_end() { ss.pos = ss.ilen } -// skip_whitespace advances the scanner pass any space characters in the input. +// skip_whitespace advances the scanner past any space characters in the input. +@[inline] pub fn (mut ss TextScanner) skip_whitespace() { - for ss.ilen - ss.pos > 0 && ss.peek_u8().is_space() { + // in rune mode peek_u8() keeps only the low 8 bits of a rune (U+0120 would look like a space), so check the full code point first + for ss.ilen - ss.pos > 0 && ss.peek() <= 0xff && ss.peek_u8().is_space() { ss.next() } } + +// next_line returns the text from the current position up to (but not +// including) the next line ending (`\n`, `\r` or `\r\n`), and moves the +// scanner just past that line ending. The bool is true when a line ending +// was found, and false when the end of the input was reached first. +@[direct_array_access] +pub fn (mut ss TextScanner) next_line() (string, bool) { + if ss.pos == ss.ilen { + return '', false + } + start := ss.pos + mut end := ss.ilen + if ss.config.force_rune_mode { + for i in start .. 
ss.ilen { + if ss.input_runes[i] == `\r` || ss.input_runes[i] == `\n` { + end = i + break + } + } + // `end` is still ss.ilen when no line ending was found; never index past the input + if end < ss.ilen && ss.input_runes[end] == `\r` { + // check next char is `\n` + if end + 1 < ss.ilen && ss.input_runes[end + 1] == `\n` { + ss.pos = end + 2 + } else { + ss.pos = end + 1 + } + } else { + ss.pos = end + 1 + } + } else { + for i in start .. ss.ilen { + if ss.input[i] == `\r` || ss.input[i] == `\n` { + end = i + break + } + } + // `end` is still ss.ilen when no line ending was found; never index past the input + if end < ss.ilen && ss.input[end] == `\r` { + // check next char is `\n` + if end + 1 < ss.ilen && ss.input[end + 1] == `\n` { + ss.pos = end + 2 + } else { + ss.pos = end + 1 + } + } else { + ss.pos = end + 1 + } + } + + if end >= ss.ilen { + ss.pos = ss.ilen + if ss.config.force_rune_mode { + return ss.input_runes[start..].string(), false + } else { + return ss.input[start..], false + } + } + if ss.pos > ss.ilen { + ss.pos = ss.ilen + } + if ss.config.force_rune_mode { + return ss.input_runes[start..end].string(), true + } else { + return ss.input[start..end], true + } +} + +// read_until reads characters from the current scanning position +// until a delimiter (from the provided string `delimiters`) is encountered. +// The returned string includes all characters from the starting +// position up to (but not including) the first encountered +// delimiter. The scanner's position is advanced to the character +// immediately after the delimiter (or to the end of the input if +// no delimiter is found). It is an error to pass empty `delimiters` or to call it at the end of the input. 
+@[direct_array_access] +pub fn (mut ss TextScanner) read_until(delimiters string) !string { + if delimiters.len == 0 { + return error('delimiters cannot be empty') + } + if ss.pos >= ss.ilen { + return error('already at EOF') + } + start := ss.pos + mut current_pos := ss.pos + if ss.config.force_rune_mode { + delimiters_runes := delimiters.runes() // rune mode: every rune of `delimiters` is a separate delimiter + for { + if current_pos >= ss.ilen { + break + } + r := ss.input_runes[current_pos] + if r in delimiters_runes { + end := current_pos + ss.pos = end + 1 // continue right after the delimiter + return ss.input_runes[start..end].string() + } + current_pos += 1 + } + ss.pos = ss.ilen // no delimiter found: consume the rest of the input + return ss.input_runes[start..].string() + } else { + delimiters_bytes := delimiters.bytes() // byte mode: every byte of `delimiters` is a separate delimiter + for { + if current_pos >= ss.ilen { + break + } + r := ss.input[current_pos] + if r in delimiters_bytes { + end := current_pos + ss.pos = end + 1 // continue right after the delimiter + return ss.input[start..end] + } + current_pos += 1 + } + ss.pos = ss.ilen // no delimiter found: consume the rest of the input + return ss.input[start..] + } +} + +// substr returns the part of the input from index `start` (inclusive) to `end` (exclusive); indexes count runes in rune mode and bytes otherwise. It returns '' when an index is out of range or when `start >= end`. 
+pub fn (mut ss TextScanner) substr(start int, end int) string { + if start < 0 || start > ss.ilen || end < 0 || end > ss.ilen || start >= end { // out-of-range, empty and inverted ranges all yield '' + return '' + } + if ss.config.force_rune_mode { + return ss.input_runes[start..end].string() + } else { + return ss.input[start..end] + } +} diff --git a/vlib/strings/textscanner/textscanner_test.v b/vlib/strings/textscanner/textscanner_test.v index c64b5f91772bf0..c36afcc125b20c 100644 --- a/vlib/strings/textscanner/textscanner_test.v +++ b/vlib/strings/textscanner/textscanner_test.v @@ -15,6 +15,21 @@ fn test_remaining() { assert s.remaining() == 3 } +fn test_remaining_rune() { + mut s := textscanner.new('v语言', force_rune_mode: true) + assert s.remaining() == 3 + s.next() + s.next() + assert s.remaining() == 1 + s.next() + assert s.remaining() == 0 + s.next() + s.next() + assert s.remaining() == 0 + s.reset() + assert s.remaining() == 3 +} + fn test_next() { mut s := textscanner.new('abc') assert s.next() == `a` @@ -25,6 +40,16 @@ fn test_next() { assert s.next() == -1 } +fn test_next_rune() { + mut s := textscanner.new('v语言', force_rune_mode: true) + assert s.next() == `v` + assert s.next() == `语` + assert s.next() == `言` + assert s.next() == -1 + assert s.next() == -1 + assert s.next() == -1 +} + fn test_skip() { mut s := textscanner.new('abc') assert s.next() == `a` @@ -42,6 +67,23 @@ fn test_skip() { assert s.peek() == -1 } +fn test_skip_rune() { + mut s := textscanner.new('v语言', force_rune_mode: true) + assert s.next() == `v` + s.skip() + assert s.next() == `言` + assert s.next() == -1 + + s.reset() + assert s.peek() == `v` + s.skip() + assert s.peek() == `语` + s.skip() + assert s.peek() == `言` + s.skip() + assert s.peek() == -1 +} + fn test_skip_n() { mut s := textscanner.new('abc') s.skip_n(2) @@ -64,6 +106,40 @@ fn test_skip_n() { assert s.peek() == `a` s.skip_n(4) assert s.peek() == -1 + + s.reset() + assert s.peek() == `a` + s.skip_n(-3) + assert s.peek() == `a` +} + +fn test_skip_n_rune() { + mut s := 
textscanner.new('v语言', force_rune_mode: true) + s.skip_n(2) + assert s.next() == `言` + assert s.next() == -1 + + s.reset() + assert s.peek() == `v` + s.skip_n(2) + assert s.peek() == `言` + s.skip_n(2) + assert s.peek() == -1 + + s.reset() + assert s.peek() == `v` + s.skip_n(3) + assert s.peek() == -1 + + s.reset() + assert s.peek() == `v` + s.skip_n(4) + assert s.peek() == -1 + + s.reset() + assert s.peek() == `v` + s.skip_n(-3) + assert s.peek() == `v` } fn test_peek() { @@ -78,6 +154,18 @@ fn test_peek() { assert s.next() == -1 } +fn test_peek_rune() { + mut s := textscanner.new('v语言', force_rune_mode: true) + assert s.peek() == `v` + assert s.peek() == `v` + assert s.peek() == `v` + + assert s.next() == `v` + assert s.next() == `语` + assert s.next() == `言` + assert s.next() == -1 +} + fn test_peek_n() { mut s := textscanner.new('abc') assert s.peek_n(0) == `a` @@ -92,6 +180,20 @@ fn test_peek_n() { assert s.next() == -1 } +fn test_peek_n_rune() { + mut s := textscanner.new('v语言', force_rune_mode: true) + assert s.peek_n(0) == `v` + assert s.peek_n(1) == `语` + assert s.peek_n(2) == `言` + assert s.peek_n(3) == -1 + assert s.peek_n(4) == -1 + + assert s.next() == `v` + assert s.next() == `语` + assert s.next() == `言` + assert s.next() == -1 +} + fn test_back() { mut s := textscanner.new('abc') assert s.next() == `a` @@ -104,6 +206,18 @@ fn test_back() { assert s.next() == -1 } +fn test_back_rune() { + mut s := textscanner.new('v语言', force_rune_mode: true) + assert s.next() == `v` + s.back() + assert s.next() == `v` + assert s.next() == `语` + s.back() + assert s.next() == `语` + assert s.next() == `言` + assert s.next() == -1 +} + fn test_back_n() { mut s := textscanner.new('abc') assert s.next() == `a` @@ -115,6 +229,17 @@ fn test_back_n() { assert s.next() == `b` } +fn test_back_n_rune() { + mut s := textscanner.new('v语言', force_rune_mode: true) + assert s.next() == `v` + s.back_n(10) + assert s.next() == `v` + assert s.next() == `语` + assert s.next() == `言` + 
s.back_n(2) + assert s.next() == `语` +} + fn test_peek_back() { mut s := textscanner.new('abc') assert s.next() == `a` @@ -135,6 +260,26 @@ fn test_peek_back() { assert s.peek_back() == `b` } +fn test_peek_back_rune() { + mut s := textscanner.new('v语言', force_rune_mode: true) + assert s.next() == `v` + assert s.next() == `语` + // check that calling .peek_back() multiple times + // does not change the state: + assert s.peek_back() == `v` + assert s.peek_back() == `v` + assert s.peek_back() == `v` + // advance, then peek_back again: + assert s.next() == `言` + assert s.peek_back() == `语` + // peeking before the start: + s.reset() + assert s.peek_back() == -1 + // peeking right at the end: + s.goto_end() + assert s.peek_back() == `语` +} + fn test_peek_back_n() { mut s := textscanner.new('abc') s.goto_end() @@ -145,6 +290,16 @@ fn test_peek_back_n() { assert s.peek_back_n(4) == -1 } +fn test_peek_back_n_rune() { + mut s := textscanner.new('v语言', force_rune_mode: true) + s.goto_end() + assert s.peek_back_n(0) == `言` + assert s.peek_back_n(1) == `语` + assert s.peek_back_n(2) == `v` + assert s.peek_back_n(3) == -1 + assert s.peek_back_n(4) == -1 +} + fn test_reset() { mut s := textscanner.new('abc') assert s.next() == `a` @@ -155,6 +310,16 @@ fn test_reset() { assert s.next() == `a` } +fn test_reset_rune() { + mut s := textscanner.new('v语言', force_rune_mode: true) + assert s.next() == `v` + s.next() + s.next() + assert s.next() == -1 + s.reset() + assert s.next() == `v` +} + fn test_current() { mut s := textscanner.new('abc') assert s.current() == -1 @@ -178,12 +343,41 @@ fn test_current() { assert s.current() == `a` } +fn test_current_rune() { + mut s := textscanner.new('v语言', force_rune_mode: true) + assert s.current() == -1 + assert s.next() == `v` + assert s.current() == `v` + assert s.current() == `v` + assert s.peek_back() == -1 + assert s.next() == `语` + assert s.current() == `语` + assert s.current() == `语` + assert s.peek_back() == `v` + assert s.next() == `言` + 
assert s.current() == `言` + assert s.next() == -1 + assert s.current() == `言` + assert s.next() == -1 + assert s.current() == `言` + s.reset() + assert s.current() == -1 + assert s.next() == `v` + assert s.current() == `v` +} + fn test_goto_end() { mut s := textscanner.new('abc') s.goto_end() assert s.current() == `c` } +fn test_goto_end_rune() { + mut s := textscanner.new('v语言', force_rune_mode: true) + s.goto_end() + assert s.current() == `言` +} + fn test_skip_whitespace() { mut s := textscanner.new('abc d \n xyz') assert s.current() == -1 @@ -198,6 +392,20 @@ fn test_skip_whitespace() { assert s.next() == `z` } +fn test_skip_whitespace_rune() { + mut s := textscanner.new('v语言 ♥ \n 大家好', force_rune_mode: true) + assert s.current() == -1 + assert s.next() == `v` + assert s.next() == `语` + assert s.next() == `言` + s.skip_whitespace() + assert s.next() == `♥` + s.skip_whitespace() + assert s.next() == `大` + assert s.next() == `家` + assert s.next() == `好` +} + fn test_peek_u8() { mut s := textscanner.new('abc') assert s.peek_u8() == `a` @@ -206,6 +414,15 @@ fn test_peek_u8() { assert s.peek_u8() == `b` } +fn test_peek_u8_rune() { + // NOTE: in rune mode `peek_u8()` casts the current rune to u8, so for `语` only the truncated value is compared; prefer `peek()` for non-ASCII input + mut s := textscanner.new('v语言', force_rune_mode: true) + assert s.peek_u8() == u8(`v`) + assert !s.peek_u8().is_digit() + assert s.next() == u8(`v`) + assert s.peek_u8() == u8(`语`) +} + fn test_peek_n_u8() { mut s := textscanner.new('abc') assert s.peek_n_u8(0) == `a` @@ -214,3 +431,129 @@ fn test_peek_n_u8() { assert s.peek_n_u8(3) == 0 assert s.peek_n_u8(4) == 0 } + +fn test_peek_n_u8_rune() { + // NOTE: in rune mode `peek_n_u8()` casts each rune to u8, so for `语` and `言` only the truncated values are compared; prefer `peek_n()` for non-ASCII input + mut s := textscanner.new('v语言', force_rune_mode: true) + assert s.peek_n_u8(0) == u8(`v`) + assert s.peek_n_u8(1) == u8(`语`) + assert s.peek_n_u8(2) == u8(`言`) + assert s.peek_n_u8(3) == 0 + assert s.peek_n_u8(4) == 0 +} + +fn test_next_line() { + mut s := textscanner.new('abc\r\n123\n\n8') + line1, end1 := s.next_line() + assert line1 
== 'abc' + assert end1 == true // the bool reports that a line ending was found, not that the input ended + + line2, end2 := s.next_line() + assert line2 == '123' + assert end2 == true + + line3, end3 := s.next_line() + assert line3 == '' + assert end3 == true + + line4, end4 := s.next_line() + assert line4 == '8' + assert end4 == false + + line5, end5 := s.next_line() + assert line5 == '' + assert end5 == false +} + +fn test_next_line_rune() { + mut s := textscanner.new('v语言\r\n大家好\n\n♥', force_rune_mode: true) + line1, end1 := s.next_line() + assert line1 == 'v语言' + assert end1 == true + + line2, end2 := s.next_line() + assert line2 == '大家好' + assert end2 == true + + line3, end3 := s.next_line() + assert line3 == '' + assert end3 == true + + line4, end4 := s.next_line() + assert line4 == '♥' + assert end4 == false + + line5, end5 := s.next_line() + assert line5 == '' + assert end5 == false +} + +fn test_read_until() { + mut s := textscanner.new('abc\r\n12|3#') + t1 := s.read_until('|') or { panic(err) } + assert t1 == 'abc\r\n12' + + t2 := s.read_until('#') or { panic(err) } + assert t2 == '3' + t3 := s.read_until('#') or { + assert err.msg() == 'already at EOF' + 'not exist' + } + assert t3 == 'not exist' + + mut ss := textscanner.new('abc\r\n12|3#') + tt1 := ss.read_until('|#') or { panic(err) } + assert tt1 == 'abc\r\n12' + + tt2 := ss.read_until('|#') or { panic(err) } + assert tt2 == '3' + tt3 := ss.read_until('|#') or { + assert err.msg() == 'already at EOF' + 'not exist' + } + assert tt3 == 'not exist' +} + +fn test_read_until_rune() { + mut s := textscanner.new('v语言♥\r\n大家好|♥3#', force_rune_mode: true) + t1 := s.read_until('♥') or { panic(err) } + assert t1 == 'v语言' + + t2 := s.read_until('#') or { panic(err) } + assert t2 == '\r\n大家好|♥3' + t3 := s.read_until('#') or { + assert err.msg() == 'already at EOF' + '已经到结尾了' + } + assert t3 == '已经到结尾了' + + mut ss := textscanner.new('v语言♥\r\n大家好|♥3#', force_rune_mode: true) + tt1 := ss.read_until('♥#') or { panic(err) } + assert tt1 == 'v语言' + + tt2 := ss.read_until('♥#') or { 
panic(err) } + assert tt2 == '\r\n大家好|' + tt3 := ss.read_until('♥#') or { panic(err) } + assert tt3 == '3' + tt4 := ss.read_until('♥#') or { + assert err.msg() == 'already at EOF' + '已经到结尾了' + } + assert tt4 == '已经到结尾了' +} + +fn test_substr() { + mut s := textscanner.new('abc\r\n12|3#') + assert s.substr(0, 4) == 'abc\r' + assert s.substr(6, 8) == '2|' + assert s.substr(-1, 2) == '' + assert s.substr(2, 500) == '' +} + +fn test_substr_rune() { + mut s := textscanner.new('v语言♥\r\n大家好|♥3#', force_rune_mode: true) + assert s.substr(0, 4) == 'v语言♥' + assert s.substr(6, 8) == '大家' + assert s.substr(-1, 2) == '' + assert s.substr(2, 500) == '' +}