amu/scanner/scanner.go
2021-10-02 15:12:57 -07:00

295 lines
6.1 KiB
Go

/*
AMU: Custom simple markup language
Copyright (C) 2021 Arsen Musayelyan
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
// Package scanner provides a scanner and tokenizer for AMU source code
package scanner
import (
"bufio"
"bytes"
"io"
"unicode"
)
// Token identifies the kind of lexeme produced by the scanner.
type Token int

// The token kinds, numbered consecutively from zero.
const (
	// EOF is reported when no further input can be read.
	EOF Token = iota
	// EOL is a run of line-ending characters ('\n' or '\r').
	EOL
	// WS is a run of whitespace that contains no line endings.
	WS
	// WORD is a run of text ended by whitespace or punctuation. It is
	// also the fallback for text that does not form a valid HEADING
	// or FORMAT token.
	WORD
	// HEADING is a heading marker made of '#' characters.
	HEADING
	// FORMAT is a span of text delimited by a format rune
	// ('*', '_', '$', '`' or '~').
	FORMAT
	// PUNCT is a single punctuation rune.
	PUNCT
)

// eof is the sentinel rune that stands for "no rune could be read".
var eof = rune(0)

// String returns the upper-case name of the token kind, or "unknown"
// when t is not one of the declared kinds.
func (t Token) String() string {
	// Table indexed by the token's numeric value.
	names := [...]string{
		EOF:     "EOF",
		EOL:     "EOL",
		WS:      "WS",
		WORD:    "WORD",
		HEADING: "HEADING",
		FORMAT:  "FORMAT",
		PUNCT:   "PUNCT",
	}
	if t < 0 || int(t) >= len(names) {
		return "unknown"
	}
	return names[t]
}
// Scanner tokenizes AMU source text pulled from a buffered reader.
type Scanner struct {
	// reader supplies the runes to be tokenized.
	reader *bufio.Reader
}

// New returns a Scanner that reads from r. An r that already is a
// *bufio.Reader is used directly, and the Reader half of a
// *bufio.ReadWriter is reused; any other reader is wrapped in a
// fresh bufio.Reader.
func New(r io.Reader) *Scanner {
	var src *bufio.Reader
	if br, ok := r.(*bufio.Reader); ok {
		src = br
	} else if rw, ok := r.(*bufio.ReadWriter); ok {
		src = rw.Reader
	} else {
		src = bufio.NewReader(r)
	}
	return &Scanner{reader: src}
}
// read consumes and returns the next rune from the underlying
// bufio.Reader. Any read error, including reaching the end of the
// input, is reported as the eof sentinel.
func (s *Scanner) read() rune {
	r, _, err := s.reader.ReadRune()
	if err == nil {
		return r
	}
	return eof
}
// unread pushes the most recently read rune back onto the
// underlying bufio.Reader so that the next read returns it again.
func (s *Scanner) unread() {
	// The error result is intentionally dropped: unreading is
	// best-effort and callers have no way to act on a failure.
	rd := s.reader
	_ = rd.UnreadRune()
}
// scanHeading attempts to scan a HEADING token: a run of '#'
// characters followed by whitespace or the end of the input.
// When the run is followed by any other character, or when the
// literal is longer than six bytes, the literal is returned as a
// WORD token instead.
func (s *Scanner) scanHeading() (Token, string) {
	var lit bytes.Buffer
	// The first rune is consumed unconditionally.
	lit.WriteRune(s.read())

scan:
	for {
		switch r := s.read(); {
		case r == eof:
			// Input exhausted; the literal is complete.
			break scan
		case r == '#':
			// Another heading level.
			lit.WriteRune(r)
		case unicode.IsSpace(r):
			// Whitespace terminates the marker; leave it for the next token.
			s.unread()
			break scan
		default:
			// Anything else glued to the '#' run means this is not a heading.
			s.unread()
			return WORD, lit.String()
		}
	}

	// Too many levels: not a valid heading.
	if lit.Len() > 6 {
		return WORD, lit.String()
	}
	return HEADING, lit.String()
}
// isEOL reports whether char is a line-ending character, i.e. a
// line feed or a carriage return.
func isEOL(char rune) bool {
	switch char {
	case '\n', '\r':
		return true
	}
	return false
}
// scanEOL scans an EOL token. Consecutive line-ending characters
// are collected into a single literal.
func (s *Scanner) scanEOL() (Token, string) {
	var lit bytes.Buffer
	// The first rune is consumed unconditionally.
	lit.WriteRune(s.read())

	for r := s.read(); r != eof; r = s.read() {
		if !isEOL(r) {
			// First rune of the next line; leave it for the next token.
			s.unread()
			break
		}
		lit.WriteRune(r)
	}
	return EOL, lit.String()
}
// scanWord scans a WORD token. The word extends until whitespace,
// punctuation or the end of the input is reached.
func (s *Scanner) scanWord() (Token, string) {
	var lit bytes.Buffer
	// The first rune is consumed unconditionally.
	lit.WriteRune(s.read())

	for {
		r := s.read()
		if r == eof {
			break
		}
		if unicode.IsSpace(r) || unicode.IsPunct(r) {
			// Terminator belongs to the next token.
			s.unread()
			break
		}
		lit.WriteRune(r)
	}
	return WORD, lit.String()
}
// scanWhitespace scans a WS token: a run of whitespace that stops
// before any line-ending character, any non-space character or the
// end of the input.
func (s *Scanner) scanWhitespace() (Token, string) {
	var lit bytes.Buffer
	// The first rune is consumed unconditionally.
	lit.WriteRune(s.read())

	for r := s.read(); r != eof; r = s.read() {
		if isEOL(r) || !unicode.IsSpace(r) {
			// Line endings and non-space runes start a different token.
			s.unread()
			break
		}
		lit.WriteRune(r)
	}
	return WS, lit.String()
}
// isFormatRune reports whether char is one of the format
// characters: '*', '_', '$', '`' or '~'.
func isFormatRune(char rune) bool {
	switch char {
	case '*', '_', '$', '`', '~':
		return true
	default:
		return false
	}
}
// scanFormat scans a FORMAT token: an opening format rune, the text
// that follows it on the same line, and the matching closing format
// rune. The returned literal includes both delimiters.
//
// If the line or the input ends before the closing rune is found,
// the span is not a valid format and the text consumed so far is
// returned as a WORD token instead.
func (s *Scanner) scanFormat() (Token, string) {
	// Remember the opening format rune; the same rune closes the span.
	delim := s.read()

	var lit bytes.Buffer
	lit.WriteRune(delim)

	for {
		r := s.read()
		switch {
		case r == eof:
			// Input ended without a closing rune. Treat this exactly
			// like an unterminated span at the end of a line so that a
			// final line without a trailing newline is tokenized the
			// same way as any other line.
			return WORD, lit.String()
		case isEOL(r):
			// Formats may not span lines; leave the line ending for
			// the next token and fall back to a WORD.
			s.unread()
			return WORD, lit.String()
		case r == delim:
			// Closing rune found: the format is complete.
			lit.WriteRune(r)
			return FORMAT, lit.String()
		default:
			lit.WriteRune(r)
		}
	}
}
// scanPunct scans a PUNCT token, which is always exactly one rune.
func (s *Scanner) scanPunct() (Token, string) {
	r := s.read()
	return PUNCT, string(r)
}
// Scan returns the next token from the input together with its
// literal text. It returns EOF with an empty literal when no rune
// can be read.
func (s *Scanner) Scan() (Token, string) {
	// Peek at the next rune: read it, then put it back so that the
	// chosen scan routine sees it as its first rune.
	next := s.read()
	s.unread()

	// Order matters: line endings are also whitespace, and '#' and
	// some format runes are also punctuation, so the more specific
	// checks come first.
	switch {
	case isEOL(next):
		return s.scanEOL()
	case unicode.IsSpace(next):
		return s.scanWhitespace()
	case next == '#':
		return s.scanHeading()
	case isFormatRune(next):
		return s.scanFormat()
	case unicode.IsPunct(next):
		return s.scanPunct()
	case next != eof:
		return s.scanWord()
	default:
		return EOF, ""
	}
}