Initial Commit

2021-10-02 15:12:57 -07:00
commit 1ff241a74e
22 changed files with 3000 additions and 0 deletions
--- a/scanner/scanner.go
+++ b/scanner/scanner.go
@@ -0,0 +1,294 @@
+/*
+   AMU: Custom simple markup language
+   Copyright (C) 2021 Arsen Musayelyan
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.
+*/
+
+// Package scanner provides a scanner and tokenizer for AMU source code
+package scanner
+
+import (
+	"bufio"
+	"bytes"
+	"io"
+	"unicode"
+)
+
+// Token identifies the lexical class of a piece of AMU source text
+// returned by Scanner.Scan.
+type Token int
+
+const (
+	// EOF is returned by Scan once no further rune can be read.
+	EOF Token = iota
+	// EOL is a run of one or more '\n' or '\r' characters.
+	EOL
+	// WS is a run of whitespace that contains no end-of-line characters.
+	WS
+	// WORD is a run of characters ended by whitespace, punctuation or the
+	// end of input. It is also the fallback for text that started like a
+	// heading or a format but turned out not to be one.
+	WORD
+	// HEADING is a run of one to six '#' characters.
+	HEADING
+	// FORMAT is a span that opens with a format rune ('*', '_', '$', '`'
+	// or '~') and runs through the next occurrence of that same rune.
+	FORMAT
+	// PUNCT is a single rune for which unicode.IsPunct is true and that
+	// did not start a heading or a format.
+	PUNCT
+)
+
+// eof is the sentinel returned by read when the underlying reader cannot
+// supply another rune. It is negative so that it can never compare equal to
+// a rune decoded from the input: with a sentinel of 0, a NUL character in
+// the source is indistinguishable from end of input and makes Scan report
+// EOF at that position even though more text follows.
+const eof rune = -1
+
+// String returns the upper-case name of the token, or "unknown" when t is
+// not one of the declared Token constants.
+func (t Token) String() string {
+	// Name table indexed by token value.
+	names := [...]string{
+		EOF:     "EOF",
+		EOL:     "EOL",
+		WS:      "WS",
+		WORD:    "WORD",
+		HEADING: "HEADING",
+		FORMAT:  "FORMAT",
+		PUNCT:   "PUNCT",
+	}
+
+	// Anything outside the table has no name.
+	if t < 0 || int(t) >= len(names) {
+		return "unknown"
+	}
+	return names[t]
+}
+
+// Scanner is a lexer for AMU source code. It pulls runes from a buffered
+// reader and groups them into tokens on each call to Scan.
+type Scanner struct {
+	// reader is the buffered rune source; the scan functions read from it
+	// and push back the one rune that ends a token.
+	reader *bufio.Reader
+}
+
+// New returns a Scanner that reads from r. A *bufio.Reader is used as is
+// and the Reader half of a *bufio.ReadWriter is reused, so that input is
+// not buffered twice; any other reader is wrapped in a new bufio.Reader.
+func New(r io.Reader) *Scanner {
+	var buffered *bufio.Reader
+
+	switch src := r.(type) {
+	case *bufio.Reader:
+		buffered = src
+	case *bufio.ReadWriter:
+		buffered = src.Reader
+	default:
+		buffered = bufio.NewReader(r)
+	}
+
+	return &Scanner{reader: buffered}
+}
+
+// read returns the next rune from the underlying bufio.Reader. Any read
+// error, including end of input, is reported as the eof sentinel.
+func (s *Scanner) read() rune {
+	r, _, err := s.reader.ReadRune()
+	if err == nil {
+		return r
+	}
+	return eof
+}
+
+// unread pushes the most recently read rune back onto the underlying
+// bufio.Reader so that the next call to read returns it again.
+func (s *Scanner) unread() {
+	// A failure here means there is no rune to push back (for example
+	// when the preceding read did not succeed). Callers have nothing
+	// useful to do in that case, so the error is deliberately dropped.
+	if err := s.reader.UnreadRune(); err != nil {
+		return
+	}
+}
+
+// scanHeading scans a run of '#' characters. The run is returned as a
+// HEADING token when it is at most six characters long and is followed by
+// whitespace or the end of input; otherwise it is returned as a WORD token.
+func (s *Scanner) scanHeading() (Token, string) {
+	var lit bytes.Buffer
+	// The first rune has already been peeked by the caller.
+	lit.WriteRune(s.read())
+
+	for {
+		c := s.read()
+		if c == eof {
+			break
+		}
+		if c == '#' {
+			// Still inside the run of '#' characters
+			lit.WriteRune(c)
+			continue
+		}
+
+		// The run has ended; leave the terminating rune for the next token
+		s.unread()
+		if !unicode.IsSpace(c) {
+			// Text directly after the '#' run means this is not a heading
+			return WORD, lit.String()
+		}
+		break
+	}
+
+	// More than six levels is not a valid heading
+	if lit.Len() > 6 {
+		return WORD, lit.String()
+	}
+	return HEADING, lit.String()
+}
+
+// isEOL reports whether char is a line feed or a carriage return.
+func isEOL(char rune) bool {
+	switch char {
+	case '\n', '\r':
+		return true
+	}
+	return false
+}
+
+// scanEOL scans an EOL token: the end-of-line rune the caller peeked plus
+// every end-of-line rune that immediately follows it.
+func (s *Scanner) scanEOL() (Token, string) {
+	var lit bytes.Buffer
+	lit.WriteRune(s.read())
+
+	for c := s.read(); c != eof; c = s.read() {
+		if !isEOL(c) {
+			// First rune of the next line; leave it for the next token
+			s.unread()
+			break
+		}
+		lit.WriteRune(c)
+	}
+
+	return EOL, lit.String()
+}
+
+// scanWord scans a WORD token: the rune the caller peeked plus every
+// following rune up to, but not including, the next whitespace or
+// punctuation rune or the end of input.
+func (s *Scanner) scanWord() (Token, string) {
+	var word bytes.Buffer
+	word.WriteRune(s.read())
+
+scan:
+	for {
+		switch c := s.read(); {
+		case c == eof:
+			break scan
+		case unicode.IsSpace(c), unicode.IsPunct(c):
+			// Word boundary; leave the rune for the next token
+			s.unread()
+			break scan
+		default:
+			word.WriteRune(c)
+		}
+	}
+
+	return WORD, word.String()
+}
+
+// scanWhitespace scans a WS token: the rune the caller peeked plus every
+// following whitespace rune that is not an end-of-line character.
+func (s *Scanner) scanWhitespace() (Token, string) {
+	var ws bytes.Buffer
+	ws.WriteRune(s.read())
+
+	for {
+		c := s.read()
+		if c == eof {
+			break
+		}
+		if unicode.IsSpace(c) && !isEOL(c) {
+			// Still inside the whitespace run
+			ws.WriteRune(c)
+			continue
+		}
+		// End of the run; leave the rune for the next token
+		s.unread()
+		break
+	}
+
+	return WS, ws.String()
+}
+
+// isFormatRune reports whether char is one of the runes that opens and
+// closes a format span: '*', '_', '$', '`' or '~'.
+func isFormatRune(char rune) bool {
+	switch char {
+	case '*', '_', '$', '`', '~':
+		return true
+	default:
+		return false
+	}
+}
+
+// scanFormat scans a FORMAT token. The rune the caller peeked is the format
+// rune; the token runs through the next occurrence of that same rune and
+// the returned literal includes both delimiters.
+//
+// If the line or the input ends before the closing rune is found, the span
+// is not a valid format and everything read so far is returned as a WORD
+// token instead. An end-of-line rune that ends the span is left unread for
+// the next token.
+func (s *Scanner) scanFormat() (Token, string) {
+	// Remember the opening rune so the matching closing rune can be found
+	formatRune := s.read()
+	// Create new buffer for token literal
+	buf := &bytes.Buffer{}
+	buf.WriteRune(formatRune)
+
+	for {
+		char := s.read()
+		switch {
+		case char == eof:
+			// Input ended before the format was closed. Treat this the
+			// same way as an unterminated format at end of line rather
+			// than reporting a FORMAT token with no closing rune.
+			return WORD, buf.String()
+		case isEOL(char):
+			// Formats may not span lines; leave the EOL for the next token
+			s.unread()
+			return WORD, buf.String()
+		}
+
+		buf.WriteRune(char)
+		if char == formatRune {
+			// Closing rune found; the format is complete
+			return FORMAT, buf.String()
+		}
+	}
+}
+
+// scanPunct scans a PUNCT token, which is always exactly one rune long.
+func (s *Scanner) scanPunct() (Token, string) {
+	r := s.read()
+	return PUNCT, string(r)
+}
+
+// Scan reads the next token from the input and returns its type together
+// with its literal text. Once no more input can be read it returns EOF and
+// an empty literal.
+func (s *Scanner) Scan() (Token, string) {
+	// Peek at the next rune: read it to choose a scan function, then push
+	// it back so that function sees the token from its first rune.
+	next := s.read()
+	s.unread()
+
+	switch {
+	case isEOL(next):
+		return s.scanEOL()
+	case unicode.IsSpace(next):
+		return s.scanWhitespace()
+	case next == '#':
+		return s.scanHeading()
+	case isFormatRune(next):
+		return s.scanFormat()
+	case unicode.IsPunct(next):
+		return s.scanPunct()
+	case next != eof:
+		return s.scanWord()
+	default:
+		return EOF, ""
+	}
+}