Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

strings.textscanner: add rune mode support; add next_line() , read_until(), substr() funcs #23874

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
224 changes: 205 additions & 19 deletions vlib/strings/textscanner/textscanner.v
Original file line number Diff line number Diff line change
Expand Up @@ -3,27 +3,49 @@ module textscanner
// TextScanner simplifies writing small scanners/parsers
// by providing safe methods to scan texts character by
// character, peek for the next characters, go back, etc.
// TODO: maybe a generic type is more suitable for this.
pub struct TextScanner {
pub:
input string
ilen int
input_runes []rune
input string
ilen int
pub mut:
pos int // current position; pos is *always* kept in [0,ilen]
pos int // current position; pos is *always* kept in [0,ilen]
config TextScannerConfig
}

// TextScannerConfig holds the optional settings that can be passed to `new`.
@[params]
pub struct TextScannerConfig {
pub mut:
// force_rune_mode: when true, `new` decodes the input into runes and the
// scanner length is the rune count; when false, the scanner works on the
// bytes of the input string and the length is the byte count.
force_rune_mode bool
}

// new returns a stack allocated instance of TextScanner.
pub fn new(input string) TextScanner {
pub fn new(input string, config TextScannerConfig) TextScanner {
if config.force_rune_mode {
input_runes := input.runes()
return TextScanner{
input_runes: input_runes
ilen: input_runes.len
config: config
}
}
return TextScanner{
input: input
ilen: input.len
input: input.clone()
ilen: input.len
config: config
}
}

// free frees all allocated resources.
@[unsafe]
pub fn (mut ss TextScanner) free() {
unsafe {
ss.input.free()
if ss.config.force_rune_mode {
ss.input_runes.free()
} else {
ss.input.free()
}
}
}

Expand All @@ -41,7 +63,11 @@ pub fn (mut ss TextScanner) next() int {
if ss.pos < ss.ilen {
opos := ss.pos
ss.pos++
return ss.input[opos]
if ss.config.force_rune_mode {
return int(ss.input_runes[opos])
} else {
return ss.input[opos]
}
}
return -1
}
Expand All @@ -59,7 +85,9 @@ pub fn (mut ss TextScanner) skip() {
@[inline]
pub fn (mut ss TextScanner) skip_n(n int) {
ss.pos += n
if ss.pos > ss.ilen {
if ss.pos < 0 {
ss.pos = 0
} else if ss.pos > ss.ilen {
ss.pos = ss.ilen
}
}
Expand All @@ -70,7 +98,11 @@ pub fn (mut ss TextScanner) skip_n(n int) {
@[direct_array_access; inline]
pub fn (ss &TextScanner) peek() int {
if ss.pos < ss.ilen {
return ss.input[ss.pos]
if ss.config.force_rune_mode {
return int(ss.input_runes[ss.pos])
} else {
return ss.input[ss.pos]
}
}
return -1
}
Expand All @@ -83,7 +115,11 @@ pub fn (ss &TextScanner) peek() int {
@[direct_array_access; inline]
pub fn (ss &TextScanner) peek_u8() u8 {
if ss.pos < ss.ilen {
return ss.input[ss.pos]
if ss.config.force_rune_mode {
return u8(ss.input_runes[ss.pos])
} else {
return ss.input[ss.pos]
}
}
return 0
}
Expand All @@ -94,10 +130,15 @@ pub fn (ss &TextScanner) peek_u8() u8 {
// ts.peek_n(1) == ts.peek() .
@[direct_array_access; inline]
pub fn (ss &TextScanner) peek_n(n int) int {
if ss.pos + n < ss.ilen {
new_pos := ss.pos + n
if new_pos < 0 || new_pos >= ss.ilen {
return -1
}
if ss.config.force_rune_mode {
return int(ss.input_runes[ss.pos + n])
} else {
return ss.input[ss.pos + n]
}
return -1
}

// peek_n_u8 returns the character code from the input text, at position + `n`,
Expand All @@ -107,10 +148,15 @@ pub fn (ss &TextScanner) peek_n(n int) int {
// legitimately contain bytes with value `0`.
@[direct_array_access; inline]
pub fn (ss &TextScanner) peek_n_u8(n int) u8 {
if ss.pos + n < ss.ilen {
new_pos := ss.pos + n
if new_pos < 0 || new_pos >= ss.ilen {
return 0
}
if ss.config.force_rune_mode {
return u8(ss.input_runes[ss.pos + n])
} else {
return ss.input[ss.pos + n]
}
return 0
}

// back goes back one character from the current scanner position.
Expand All @@ -122,12 +168,12 @@ pub fn (mut ss TextScanner) back() {
}

// back_n goes back `n` characters from the current scanner position.
@[inline]
pub fn (mut ss TextScanner) back_n(n int) {
ss.pos -= n
if ss.pos < 0 {
ss.pos = 0
}
if ss.pos > ss.ilen {
} else if ss.pos > ss.ilen {
ss.pos = ss.ilen
}
}
Expand All @@ -148,7 +194,11 @@ pub fn (ss &TextScanner) peek_back() int {
pub fn (ss &TextScanner) peek_back_n(n int) int {
offset := n + 1
if ss.pos >= offset {
return ss.input[ss.pos - offset]
if ss.config.force_rune_mode {
return int(ss.input_runes[ss.pos - offset])
} else {
return ss.input[ss.pos - offset]
}
}
return -1
}
Expand All @@ -159,14 +209,19 @@ pub fn (ss &TextScanner) peek_back_n(n int) int {
@[direct_array_access; inline]
pub fn (mut ss TextScanner) current() int {
if ss.pos > 0 {
return ss.input[ss.pos - 1]
if ss.config.force_rune_mode {
return int(ss.input_runes[ss.pos - 1])
} else {
return ss.input[ss.pos - 1]
}
}
return -1
}

// reset resets the internal state of the scanner.
// After calling .reset(), .next() will start reading
// again from the start of the input text.
// Only the position is changed; the input and the config are left as they are.
@[inline]
pub fn (mut ss TextScanner) reset() {
ss.pos = 0
}
Expand All @@ -175,13 +230,144 @@ pub fn (mut ss TextScanner) reset() {
// i.e. after calling .goto_end(), the scanner will be at
// the end of the input text. Further .next() calls will
// return -1, unless you go back.
@[inline]
pub fn (mut ss TextScanner) goto_end() {
// ilen is one past the last valid index, so `pos < ilen` is false from here on.
ss.pos = ss.ilen
}

// skip_whitespace advances the scanner past any whitespace characters
// at the current position. It stops at the first non-whitespace
// character, or at the end of the input.
// The whitespace test is `u8.is_space()`, so only code points that fit
// in a single byte can match. In rune mode, wider runes are never treated
// as whitespace, even when their low byte happens to be a space byte.
@[inline]
pub fn (mut ss TextScanner) skip_whitespace() {
	for ss.pos < ss.ilen {
		// peek() returns the full byte/rune value; peek_u8() would truncate
		// a rune to its low byte and misclassify e.g. U+0120 as a space.
		c := ss.peek()
		if c < 0 || c > 0xff || !u8(c).is_space() {
			break
		}
		ss.pos++
	}
}

// next_line reads the rest of the current line and advances the scanner
// to the start of the next line.
// A line ends at `\n`, `\r`, or `\r\n`; the terminator is consumed but
// is not part of the returned string.
// It returns the line and `true` when a line terminator was found.
// When the input ends without a terminator, it returns the remaining
// text and `false`. When the scanner is already at the end of the input,
// it returns an empty string and `false`.
@[direct_array_access]
pub fn (mut ss TextScanner) next_line() (string, bool) {
	if ss.pos >= ss.ilen {
		return '', false
	}
	start := ss.pos
	mut end := ss.ilen
	if ss.config.force_rune_mode {
		for i in start .. ss.ilen {
			if ss.input_runes[i] == `\r` || ss.input_runes[i] == `\n` {
				end = i
				break
			}
		}
		// No terminator: return before touching index `end`, which would be
		// one past the last element (bounds checks are disabled here).
		if end >= ss.ilen {
			ss.pos = ss.ilen
			return ss.input_runes[start..].string(), false
		}
		ss.pos = end + 1
		// Treat `\r\n` as a single terminator.
		if ss.input_runes[end] == `\r` && ss.pos < ss.ilen && ss.input_runes[ss.pos] == `\n` {
			ss.pos++
		}
		return ss.input_runes[start..end].string(), true
	}
	for i in start .. ss.ilen {
		if ss.input[i] == `\r` || ss.input[i] == `\n` {
			end = i
			break
		}
	}
	// No terminator: return before touching index `end` (see above).
	if end >= ss.ilen {
		ss.pos = ss.ilen
		return ss.input[start..], false
	}
	ss.pos = end + 1
	// Treat `\r\n` as a single terminator.
	if ss.input[end] == `\r` && ss.pos < ss.ilen && ss.input[ss.pos] == `\n` {
		ss.pos++
	}
	return ss.input[start..end], true
}

// read_until scans forward from the current position until it meets
// any one of the characters listed in `delimiters`.
// The result holds everything from the starting position up to, but not
// including, that delimiter, and the scanner is left just after it.
// If no delimiter occurs in the rest of the input, the remaining text is
// returned and the scanner is moved to the end of the input.
// An error is returned when `delimiters` is empty, or when the scanner
// is already at the end of the input.
// In rune mode the delimiters are compared rune by rune; otherwise they
// are compared byte by byte.
@[direct_array_access]
pub fn (mut ss TextScanner) read_until(delimiters string) !string {
	if delimiters.len == 0 {
		return error('delimiters cannot be empty')
	}
	if ss.pos >= ss.ilen {
		return error('already at EOF')
	}
	begin := ss.pos
	if ss.config.force_rune_mode {
		stop_runes := delimiters.runes()
		for idx in begin .. ss.ilen {
			if ss.input_runes[idx] in stop_runes {
				ss.pos = idx + 1
				return ss.input_runes[begin..idx].string()
			}
		}
		ss.pos = ss.ilen
		return ss.input_runes[begin..].string()
	}
	stop_bytes := delimiters.bytes()
	for idx in begin .. ss.ilen {
		if ss.input[idx] in stop_bytes {
			ss.pos = idx + 1
			return ss.input[begin..idx]
		}
	}
	ss.pos = ss.ilen
	return ss.input[begin..]
}

// substr returns the part of the input in the half-open range
// [start, end). The indices count runes in rune mode and bytes otherwise.
// An empty string is returned when the range is empty, reversed, or not
// fully inside the input.
pub fn (mut ss TextScanner) substr(start int, end int) string {
	// With 0 <= start < end <= ilen, the remaining bounds
	// (start <= ilen and end >= 0) hold automatically.
	in_range := start >= 0 && start < end && end <= ss.ilen
	if !in_range {
		return ''
	}
	if ss.config.force_rune_mode {
		return ss.input_runes[start..end].string()
	}
	return ss.input[start..end]
}
Loading
Loading