package syntax import ( "bufio" "bytes" "fmt" "io" "strings" "time" "unicode" ) // Token is the type of a lexical token in the query grammar. type Token byte const ( TInvalid = iota // invalid or unknown token TTag // field tag: x.y TString // string value: 'foo bar' TNumber // number: 0, 15.5, 100 TTime // timestamp: TIME yyyy-mm-ddThh:mm:ss([-+]hh:mm|Z) TDate // datestamp: DATE yyyy-mm-dd TAnd // operator: AND TContains // operator: CONTAINS TExists // operator: EXISTS TEq // operator: = TLt // operator: < TLeq // operator: <= TGt // operator: > TGeq // operator: >= // Do not reorder these values without updating the scanner code. ) var tString = [...]string{ TInvalid: "invalid token", TTag: "tag", TString: "string", TNumber: "number", TTime: "timestamp", TDate: "datestamp", TAnd: "AND operator", TContains: "CONTAINS operator", TExists: "EXISTS operator", TEq: "= operator", TLt: "< operator", TLeq: "<= operator", TGt: "> operator", TGeq: ">= operator", } func (t Token) String() string { v := int(t) if v > len(tString) { return "unknown token type" } return tString[v] } const ( // TimeFormat is the format string used for timestamp values. TimeFormat = time.RFC3339 // DateFormat is the format string used for datestamp values. DateFormat = "2006-01-02" ) // Scanner reads lexical tokens of the query language from an input stream. // Each call to Next advances the scanner to the next token, or reports an // error. type Scanner struct { r *bufio.Reader buf bytes.Buffer tok Token err error pos, last, end int } // NewScanner constructs a new scanner that reads from r. func NewScanner(r io.Reader) *Scanner { return &Scanner{r: bufio.NewReader(r)} } // Next advances s to the next token in the input, or reports an error. At the // end of input, Next returns io.EOF. func (s *Scanner) Next() error { s.buf.Reset() s.pos = s.end s.tok = TInvalid s.err = nil for { ch, err := s.rune() if err != nil { return s.fail(err) } if unicode.IsSpace(ch) { s.pos = s.end continue // skip whitespace } if '0' <= ch && ch <= '9' { return s.scanNumber(ch) } else if isTagRune(ch) { return s.scanTagLike(ch) } switch ch { case '\'': return s.scanString(ch) case '<', '>', '=': return s.scanCompare(ch) default: return s.invalid(ch) } } } // Token returns the type of the current input token. func (s *Scanner) Token() Token { return s.tok } // Text returns the text of the current input token. func (s *Scanner) Text() string { return s.buf.String() } // Pos returns the start offset of the current token in the input. func (s *Scanner) Pos() int { return s.pos } // Err returns the last error reported by Next, if any. func (s *Scanner) Err() error { return s.err } // scanNumber scans for numbers with optional fractional parts. // Examples: 0, 1, 3.14 func (s *Scanner) scanNumber(first rune) error { s.buf.WriteRune(first) if err := s.scanWhile(isDigit); err != nil { return err } ch, err := s.rune() if err != nil && err != io.EOF { return err } if ch == '.' { s.buf.WriteRune(ch) if err := s.scanWhile(isDigit); err != nil { return err } } else { s.unrune() } s.tok = TNumber return nil } func (s *Scanner) scanString(first rune) error { // discard opening quote for { ch, err := s.rune() if err != nil { return s.fail(err) } else if ch == first { // discard closing quote s.tok = TString return nil } s.buf.WriteRune(ch) } } func (s *Scanner) scanCompare(first rune) error { s.buf.WriteRune(first) switch first { case '=': s.tok = TEq return nil case '<': s.tok = TLt case '>': s.tok = TGt default: return s.invalid(first) } ch, err := s.rune() if err == io.EOF { return nil // the assigned token is correct } else if err != nil { return s.fail(err) } if ch == '=' { s.buf.WriteRune(ch) s.tok++ // depends on token order return nil } s.unrune() return nil } func (s *Scanner) scanTagLike(first rune) error { s.buf.WriteRune(first) var hasSpace bool for { ch, err := s.rune() if err == io.EOF { break } else if err != nil { return s.fail(err) } if !isTagRune(ch) { hasSpace = ch == ' ' // to check for TIME, DATE break } s.buf.WriteRune(ch) } text := s.buf.String() switch text { case "TIME": if hasSpace { return s.scanTimestamp() } s.tok = TTag case "DATE": if hasSpace { return s.scanDatestamp() } s.tok = TTag case "AND": s.tok = TAnd case "EXISTS": s.tok = TExists case "CONTAINS": s.tok = TContains default: s.tok = TTag } s.unrune() return nil } func (s *Scanner) scanTimestamp() error { s.buf.Reset() // discard "TIME" label if err := s.scanWhile(isTimeRune); err != nil { return err } if ts, err := time.Parse(TimeFormat, s.buf.String()); err != nil { return s.fail(fmt.Errorf("invalid TIME value: %w", err)) } else if y := ts.Year(); y < 1900 || y > 2999 { return s.fail(fmt.Errorf("timestamp year %d out of range", ts.Year())) } s.tok = TTime return nil } func (s *Scanner) scanDatestamp() error { s.buf.Reset() // discard "DATE" label if err := s.scanWhile(isDateRune); err != nil { return err } if ts, err := time.Parse(DateFormat, s.buf.String()); err != nil { return s.fail(fmt.Errorf("invalid DATE value: %w", err)) } else if y := ts.Year(); y < 1900 || y > 2999 { return s.fail(fmt.Errorf("datestamp year %d out of range", ts.Year())) } s.tok = TDate return nil } func (s *Scanner) scanWhile(ok func(rune) bool) error { for { ch, err := s.rune() if err == io.EOF { return nil } else if err != nil { return s.fail(err) } else if !ok(ch) { s.unrune() return nil } s.buf.WriteRune(ch) } } func (s *Scanner) rune() (rune, error) { ch, nb, err := s.r.ReadRune() s.last = nb s.end += nb return ch, err } func (s *Scanner) unrune() { _ = s.r.UnreadRune() s.end -= s.last } func (s *Scanner) fail(err error) error { s.err = err return err } func (s *Scanner) invalid(ch rune) error { return s.fail(fmt.Errorf("invalid input %c at offset %d", ch, s.end)) } func isDigit(r rune) bool { return '0' <= r && r <= '9' } func isTagRune(r rune) bool { return r == '.' || r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r) } func isTimeRune(r rune) bool { return strings.ContainsRune("-T:+Z", r) || isDigit(r) } func isDateRune(r rune) bool { return isDigit(r) || r == '-' }