Initial Commit
This commit is contained in:
294
scanner/scanner.go
Normal file
294
scanner/scanner.go
Normal file
@@ -0,0 +1,294 @@
|
||||
/*
|
||||
AMU: Custom simple markup language
|
||||
Copyright (C) 2021 Arsen Musayelyan
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
// Package scanner provides a scanner and tokenizer for AMU source code
|
||||
package scanner
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"io"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// Token identifies the kind of lexeme produced by the Scanner.
type Token int

// The token kinds, numbered consecutively starting at zero.
const (
	// EOF marks the end of the input.
	EOF Token = iota
	// EOL is a run of end-of-line characters.
	EOL
	// WS is a run of whitespace that is not end-of-line.
	WS
	// WORD is a run of ordinary text.
	WORD
	// HEADING is a run of '#' characters introducing a heading.
	HEADING
	// FORMAT is text enclosed by a pair of format characters.
	FORMAT
	// PUNCT is a single punctuation character.
	PUNCT
)

// eof is the sentinel rune that read returns when the
// underlying reader reports an error.
var eof rune = 0

// String returns the name of the token kind, or "unknown"
// for a value that is not one of the declared kinds.
func (t Token) String() string {
	// Names indexed by token value
	names := [...]string{
		EOF:     "EOF",
		EOL:     "EOL",
		WS:      "WS",
		WORD:    "WORD",
		HEADING: "HEADING",
		FORMAT:  "FORMAT",
		PUNCT:   "PUNCT",
	}
	if t < 0 || int(t) >= len(names) {
		return "unknown"
	}
	return names[t]
}
|
||||
|
||||
// Scanner is a lexer for AMU source code. It pulls runes
// from a buffered reader and groups them into tokens.
type Scanner struct {
	// reader is the buffered source of runes
	reader *bufio.Reader
}

// New returns a Scanner reading from r. A *bufio.Reader is used
// directly, a *bufio.ReadWriter contributes its embedded Reader,
// and any other reader is wrapped in a new bufio.Reader.
func New(r io.Reader) *Scanner {
	// Already buffered: reuse as-is
	if br, ok := r.(*bufio.Reader); ok {
		return &Scanner{reader: br}
	}
	// Buffered read/write pair: use its read half
	if rw, ok := r.(*bufio.ReadWriter); ok {
		return &Scanner{reader: rw.Reader}
	}
	// Anything else gets its own buffer
	return &Scanner{reader: bufio.NewReader(r)}
}
|
||||
|
||||
// read reads a single rule from the underlying bufio.Reader
|
||||
func (s *Scanner) read() rune {
|
||||
// Read rune from reader
|
||||
char, _, err := s.reader.ReadRune()
|
||||
if err != nil {
|
||||
return eof
|
||||
}
|
||||
return char
|
||||
}
|
||||
|
||||
// unread unreads the last read rune from
|
||||
// the underlying bufio.Reader
|
||||
func (s *Scanner) unread() {
|
||||
_ = s.reader.UnreadRune()
|
||||
}
|
||||
|
||||
// scanHeading attempts to scan a HEADING token
|
||||
func (s *Scanner) scanHeading() (Token, string) {
|
||||
|
||||
// Create new buffer for token literal
|
||||
buf := &bytes.Buffer{}
|
||||
// Write first character to buffer
|
||||
buf.WriteRune(s.read())
|
||||
|
||||
for {
|
||||
// Read character
|
||||
char := s.read()
|
||||
if char == eof {
|
||||
break
|
||||
} else if char != '#' && !unicode.IsSpace(char) {
|
||||
// Unread character as this is not a valid heading
|
||||
s.unread()
|
||||
// Return literal as a WORD token
|
||||
return WORD, buf.String()
|
||||
} else if char != '#' {
|
||||
// Unread character as this is the end of the heading literal
|
||||
s.unread()
|
||||
break
|
||||
} else {
|
||||
// Write character to buffer
|
||||
buf.WriteRune(char)
|
||||
}
|
||||
}
|
||||
|
||||
// If level more than 6
|
||||
if buf.Len() > 6 {
|
||||
// Return literal as a WORD token as this is not a valid heading
|
||||
return WORD, buf.String()
|
||||
}
|
||||
|
||||
// Return HEADING token
|
||||
return HEADING, buf.String()
|
||||
}
|
||||
|
||||
// isEOL reports whether char is a line feed or a carriage return.
func isEOL(char rune) bool {
	switch char {
	case '\n', '\r':
		return true
	}
	return false
}
|
||||
|
||||
// scanEOL scans an EOL token
|
||||
func (s *Scanner) scanEOL() (Token, string) {
|
||||
// Create new buffer for token literal
|
||||
buf := &bytes.Buffer{}
|
||||
// Write first character to buffer
|
||||
buf.WriteRune(s.read())
|
||||
|
||||
for {
|
||||
// Read character
|
||||
char := s.read()
|
||||
if char == eof {
|
||||
break
|
||||
} else if !isEOL(char) {
|
||||
// Unread character as this is the beginning of the next line
|
||||
s.unread()
|
||||
break
|
||||
} else {
|
||||
// Write character to buffer
|
||||
buf.WriteRune(char)
|
||||
}
|
||||
}
|
||||
|
||||
// Return EOL token
|
||||
return EOL, buf.String()
|
||||
}
|
||||
|
||||
// scanWord scans a WORD token
|
||||
func (s *Scanner) scanWord() (Token, string) {
|
||||
// Create new buffer for token literal
|
||||
buf := &bytes.Buffer{}
|
||||
// Write first character to buffer
|
||||
buf.WriteRune(s.read())
|
||||
|
||||
for {
|
||||
// Read character
|
||||
char := s.read()
|
||||
if char == eof {
|
||||
break
|
||||
} else if unicode.IsSpace(char) || unicode.IsPunct(char) {
|
||||
// Unread as this is the end of the word
|
||||
s.unread()
|
||||
break
|
||||
}
|
||||
// Write character to buffer
|
||||
buf.WriteRune(char)
|
||||
}
|
||||
|
||||
// Return WORD token
|
||||
return WORD, buf.String()
|
||||
}
|
||||
|
||||
// scanWhitespace scans a WS token
|
||||
func (s *Scanner) scanWhitespace() (Token, string) {
|
||||
// Create new buffer for token literal
|
||||
buf := &bytes.Buffer{}
|
||||
// Write first character to the buffer
|
||||
buf.WriteRune(s.read())
|
||||
|
||||
for {
|
||||
// Read character
|
||||
char := s.read()
|
||||
if char == eof {
|
||||
break
|
||||
} else if !unicode.IsSpace(char) || isEOL(char) {
|
||||
// Unread as this is the end of the whitespace
|
||||
s.unread()
|
||||
break
|
||||
} else {
|
||||
// Write character to buffer
|
||||
buf.WriteRune(char)
|
||||
}
|
||||
}
|
||||
|
||||
// Return WS token
|
||||
return WS, buf.String()
|
||||
}
|
||||
|
||||
// isFormatRune reports whether char is one of the
// format characters: '*', '_', '$', '`' or '~'.
func isFormatRune(char rune) bool {
	switch char {
	case '*', '_', '$', '`', '~':
		return true
	default:
		return false
	}
}
|
||||
|
||||
// scanFormat scans a FORMAT token
|
||||
func (s *Scanner) scanFormat() (Token, string) {
|
||||
// Store format rule for use later
|
||||
formatRune := s.read()
|
||||
// Create new buffer for token literal
|
||||
buf := &bytes.Buffer{}
|
||||
// Write first character to buffer
|
||||
buf.WriteRune(formatRune)
|
||||
|
||||
for {
|
||||
// Read character
|
||||
char := s.read()
|
||||
if char == eof {
|
||||
break
|
||||
} else if isEOL(char) {
|
||||
// Unread as this is not a valid format
|
||||
s.unread()
|
||||
// Return literal as WORD token
|
||||
return WORD, buf.String()
|
||||
} else if char == formatRune {
|
||||
// Write character to buffer
|
||||
buf.WriteRune(char)
|
||||
// Stop scanning as this is the end of the format
|
||||
break
|
||||
} else {
|
||||
// Write character to buffer
|
||||
buf.WriteRune(char)
|
||||
}
|
||||
}
|
||||
|
||||
// Return FORMAt token
|
||||
return FORMAT, buf.String()
|
||||
}
|
||||
|
||||
// scanPunct scans a PUNCT token
|
||||
func (s *Scanner) scanPunct() (Token, string) {
|
||||
return PUNCT, string(s.read())
|
||||
}
|
||||
|
||||
// Scan scans a single token from the input
|
||||
func (s *Scanner) Scan() (Token, string) {
|
||||
// read character
|
||||
char := s.read()
|
||||
|
||||
// Unread character as it will be
|
||||
// needed by future functions
|
||||
s.unread()
|
||||
|
||||
// Run appropriate scan function and return result
|
||||
if isEOL(char) {
|
||||
return s.scanEOL()
|
||||
} else if unicode.IsSpace(char) {
|
||||
return s.scanWhitespace()
|
||||
} else if char == '#' {
|
||||
return s.scanHeading()
|
||||
} else if isFormatRune(char) {
|
||||
return s.scanFormat()
|
||||
} else if unicode.IsPunct(char) {
|
||||
return s.scanPunct()
|
||||
} else if char != eof {
|
||||
return s.scanWord()
|
||||
} else {
|
||||
return EOF, ""
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user