295 lines
6.1 KiB
Go
295 lines
6.1 KiB
Go
|
/*
	AMU: Custom simple markup language
	Copyright (C) 2021 Arsen Musayelyan

	This program is free software: you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
|
||
|
|
||
|
// Package scanner provides a scanner and tokenizer for AMU source code
|
||
|
package scanner
|
||
|
|
||
|
import (
|
||
|
"bufio"
|
||
|
"bytes"
|
||
|
"io"
|
||
|
"unicode"
|
||
|
)
|
||
|
|
||
|
// Token identifies the lexical class of a piece of AMU source text
// returned by the scanner.
type Token int

// Token kinds produced by the scanner.
const (
	// EOF marks the end of the input.
	EOF Token = iota
	// EOL is a run of one or more line-ending characters ('\n' or '\r').
	EOL
	// WS is a run of whitespace that contains no line-ending characters.
	WS
	// WORD is ordinary text. It is also the fallback kind for a heading
	// or format sequence that turns out not to be valid.
	WORD
	// HEADING is a run of one to six '#' characters.
	HEADING
	// FORMAT is a span of text opened by a format character
	// ('*', '_', '$', '`' or '~').
	FORMAT
	// PUNCT is a single punctuation character.
	PUNCT
)
|
||
|
|
||
|
// eof is the sentinel rune returned by Scanner.read when no further input
// can be read. It is negative so that it can never compare equal to a rune
// decoded from the input; a sentinel of 0 would make a NUL character in the
// source indistinguishable from the end of input.
const eof rune = -1
|
||
|
|
||
|
// String converts a token into a string.
|
||
|
func (t Token) String() string {
|
||
|
switch t {
|
||
|
case EOF:
|
||
|
return "EOF"
|
||
|
case EOL:
|
||
|
return "EOL"
|
||
|
case WS:
|
||
|
return "WS"
|
||
|
case WORD:
|
||
|
return "WORD"
|
||
|
case HEADING:
|
||
|
return "HEADING"
|
||
|
case FORMAT:
|
||
|
return "FORMAT"
|
||
|
case PUNCT:
|
||
|
return "PUNCT"
|
||
|
default:
|
||
|
return "unknown"
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Scanner implements a lexer for AMU source code.
type Scanner struct {
	// reader is the buffered source that runes are read from and
	// pushed back onto.
	reader *bufio.Reader
}
|
||
|
|
||
|
// New creates a new Scanner.
|
||
|
func New(r io.Reader) *Scanner {
|
||
|
|
||
|
switch r := r.(type) {
|
||
|
case *bufio.Reader:
|
||
|
return &Scanner{reader: r}
|
||
|
case *bufio.ReadWriter:
|
||
|
return &Scanner{reader: r.Reader}
|
||
|
default:
|
||
|
return &Scanner{reader: bufio.NewReader(r)}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// read reads a single rule from the underlying bufio.Reader
|
||
|
func (s *Scanner) read() rune {
|
||
|
// Read rune from reader
|
||
|
char, _, err := s.reader.ReadRune()
|
||
|
if err != nil {
|
||
|
return eof
|
||
|
}
|
||
|
return char
|
||
|
}
|
||
|
|
||
|
// unread unreads the last read rune from
|
||
|
// the underlying bufio.Reader
|
||
|
func (s *Scanner) unread() {
|
||
|
_ = s.reader.UnreadRune()
|
||
|
}
|
||
|
|
||
|
// scanHeading attempts to scan a HEADING token
|
||
|
func (s *Scanner) scanHeading() (Token, string) {
|
||
|
|
||
|
// Create new buffer for token literal
|
||
|
buf := &bytes.Buffer{}
|
||
|
// Write first character to buffer
|
||
|
buf.WriteRune(s.read())
|
||
|
|
||
|
for {
|
||
|
// Read character
|
||
|
char := s.read()
|
||
|
if char == eof {
|
||
|
break
|
||
|
} else if char != '#' && !unicode.IsSpace(char) {
|
||
|
// Unread character as this is not a valid heading
|
||
|
s.unread()
|
||
|
// Return literal as a WORD token
|
||
|
return WORD, buf.String()
|
||
|
} else if char != '#' {
|
||
|
// Unread character as this is the end of the heading literal
|
||
|
s.unread()
|
||
|
break
|
||
|
} else {
|
||
|
// Write character to buffer
|
||
|
buf.WriteRune(char)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// If level more than 6
|
||
|
if buf.Len() > 6 {
|
||
|
// Return literal as a WORD token as this is not a valid heading
|
||
|
return WORD, buf.String()
|
||
|
}
|
||
|
|
||
|
// Return HEADING token
|
||
|
return HEADING, buf.String()
|
||
|
}
|
||
|
|
||
|
// isEOL reports whether char is a line-ending character, i.e. a line feed
// ('\n') or a carriage return ('\r').
func isEOL(char rune) bool {
	return char == '\n' || char == '\r'
}
|
||
|
|
||
|
// scanEOL scans an EOL token
|
||
|
func (s *Scanner) scanEOL() (Token, string) {
|
||
|
// Create new buffer for token literal
|
||
|
buf := &bytes.Buffer{}
|
||
|
// Write first character to buffer
|
||
|
buf.WriteRune(s.read())
|
||
|
|
||
|
for {
|
||
|
// Read character
|
||
|
char := s.read()
|
||
|
if char == eof {
|
||
|
break
|
||
|
} else if !isEOL(char) {
|
||
|
// Unread character as this is the beginning of the next line
|
||
|
s.unread()
|
||
|
break
|
||
|
} else {
|
||
|
// Write character to buffer
|
||
|
buf.WriteRune(char)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Return EOL token
|
||
|
return EOL, buf.String()
|
||
|
}
|
||
|
|
||
|
// scanWord scans a WORD token
|
||
|
func (s *Scanner) scanWord() (Token, string) {
|
||
|
// Create new buffer for token literal
|
||
|
buf := &bytes.Buffer{}
|
||
|
// Write first character to buffer
|
||
|
buf.WriteRune(s.read())
|
||
|
|
||
|
for {
|
||
|
// Read character
|
||
|
char := s.read()
|
||
|
if char == eof {
|
||
|
break
|
||
|
} else if unicode.IsSpace(char) || unicode.IsPunct(char) {
|
||
|
// Unread as this is the end of the word
|
||
|
s.unread()
|
||
|
break
|
||
|
}
|
||
|
// Write character to buffer
|
||
|
buf.WriteRune(char)
|
||
|
}
|
||
|
|
||
|
// Return WORD token
|
||
|
return WORD, buf.String()
|
||
|
}
|
||
|
|
||
|
// scanWhitespace scans a WS token
|
||
|
func (s *Scanner) scanWhitespace() (Token, string) {
|
||
|
// Create new buffer for token literal
|
||
|
buf := &bytes.Buffer{}
|
||
|
// Write first character to the buffer
|
||
|
buf.WriteRune(s.read())
|
||
|
|
||
|
for {
|
||
|
// Read character
|
||
|
char := s.read()
|
||
|
if char == eof {
|
||
|
break
|
||
|
} else if !unicode.IsSpace(char) || isEOL(char) {
|
||
|
// Unread as this is the end of the whitespace
|
||
|
s.unread()
|
||
|
break
|
||
|
} else {
|
||
|
// Write character to buffer
|
||
|
buf.WriteRune(char)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Return WS token
|
||
|
return WS, buf.String()
|
||
|
}
|
||
|
|
||
|
// isFormatRune reports whether char is one of the characters that open a
// format span: '*', '_', '$', '`' or '~'.
func isFormatRune(char rune) bool {
	switch char {
	case '*', '_', '$', '`', '~':
		return true
	default:
		return false
	}
}
|
||
|
|
||
|
// scanFormat scans a FORMAT token
|
||
|
func (s *Scanner) scanFormat() (Token, string) {
|
||
|
// Store format rule for use later
|
||
|
formatRune := s.read()
|
||
|
// Create new buffer for token literal
|
||
|
buf := &bytes.Buffer{}
|
||
|
// Write first character to buffer
|
||
|
buf.WriteRune(formatRune)
|
||
|
|
||
|
for {
|
||
|
// Read character
|
||
|
char := s.read()
|
||
|
if char == eof {
|
||
|
break
|
||
|
} else if isEOL(char) {
|
||
|
// Unread as this is not a valid format
|
||
|
s.unread()
|
||
|
// Return literal as WORD token
|
||
|
return WORD, buf.String()
|
||
|
} else if char == formatRune {
|
||
|
// Write character to buffer
|
||
|
buf.WriteRune(char)
|
||
|
// Stop scanning as this is the end of the format
|
||
|
break
|
||
|
} else {
|
||
|
// Write character to buffer
|
||
|
buf.WriteRune(char)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Return FORMAt token
|
||
|
return FORMAT, buf.String()
|
||
|
}
|
||
|
|
||
|
// scanPunct scans a PUNCT token
|
||
|
func (s *Scanner) scanPunct() (Token, string) {
|
||
|
return PUNCT, string(s.read())
|
||
|
}
|
||
|
|
||
|
// Scan scans a single token from the input
|
||
|
func (s *Scanner) Scan() (Token, string) {
|
||
|
// read character
|
||
|
char := s.read()
|
||
|
|
||
|
// Unread character as it will be
|
||
|
// needed by future functions
|
||
|
s.unread()
|
||
|
|
||
|
// Run appropriate scan function and return result
|
||
|
if isEOL(char) {
|
||
|
return s.scanEOL()
|
||
|
} else if unicode.IsSpace(char) {
|
||
|
return s.scanWhitespace()
|
||
|
} else if char == '#' {
|
||
|
return s.scanHeading()
|
||
|
} else if isFormatRune(char) {
|
||
|
return s.scanFormat()
|
||
|
} else if unicode.IsPunct(char) {
|
||
|
return s.scanPunct()
|
||
|
} else if char != eof {
|
||
|
return s.scanWord()
|
||
|
} else {
|
||
|
return EOF, ""
|
||
|
}
|
||
|
}
|