// amu/scanner/scanner.go

/*
   AMU: Custom simple markup language
   Copyright (C) 2021 Arsen Musayelyan

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.
*/

// Package scanner provides a scanner and tokenizer for AMU source code
package scanner

import (
	"bufio"
	"bytes"
	"io"
	"unicode"
)

// Token identifies the lexical class of a piece of AMU source text, as
// returned by Scanner.Scan.
type Token int

// Token kinds. Values are assigned by iota in declaration order.
const (
	// EOF is returned by Scan when no further input can be read.
	EOF Token = iota
	// EOL is a run of one or more '\n' / '\r' characters.
	EOL
	// WS is a run of whitespace containing no end-of-line characters.
	WS
	// WORD is ordinary text. It is also what the scanner falls back to
	// when a '#' run or a format span turns out not to be valid.
	WORD
	// HEADING is a run of one to six '#' characters that is followed by
	// whitespace or by the end of the input.
	HEADING
	// FORMAT is a span opened by one of the format runes accepted by
	// isFormatRune ('*', '_', '$', '`', '~').
	FORMAT
	// PUNCT is a single Unicode punctuation rune that is neither '#'
	// nor a format rune.
	PUNCT
)

// eof is the sentinel rune that read returns when the underlying reader
// reports an error (including the end of the input).
var eof = rune(0)

// String returns the name of the token kind ("EOF", "EOL", "WS", "WORD",
// "HEADING", "FORMAT" or "PUNCT"). Any other value yields "unknown".
func (t Token) String() string {
	// Name table indexed by token kind.
	names := [...]string{
		EOF:     "EOF",
		EOL:     "EOL",
		WS:      "WS",
		WORD:    "WORD",
		HEADING: "HEADING",
		FORMAT:  "FORMAT",
		PUNCT:   "PUNCT",
	}

	// Anything outside the table is not a known kind.
	if t < 0 || int(t) >= len(names) {
		return "unknown"
	}
	return names[t]
}

// Scanner implements a lexer for AMU source code. Create one with New;
// every call to Scan then yields the next token from the input.
type Scanner struct {
	// reader is the buffered source that runes are read from and
	// pushed back onto.
	reader *bufio.Reader
}

// New returns a Scanner that reads from r. When r is already a
// *bufio.Reader, or a *bufio.ReadWriter, its buffered reader is used
// directly; any other reader is wrapped in a new bufio.Reader.
func New(r io.Reader) *Scanner {
	var buffered *bufio.Reader

	switch src := r.(type) {
	case *bufio.Reader:
		buffered = src
	case *bufio.ReadWriter:
		buffered = src.Reader
	default:
		buffered = bufio.NewReader(r)
	}

	return &Scanner{reader: buffered}
}

// read returns the next rune from the underlying bufio.Reader. If the
// reader reports any error (including io.EOF), the eof sentinel is
// returned instead.
func (s *Scanner) read() rune {
	if r, _, err := s.reader.ReadRune(); err == nil {
		return r
	}
	return eof
}

// unread pushes the most recently read rune back onto the underlying
// bufio.Reader so that the next call to read returns it again.
func (s *Scanner) unread() {
	if err := s.reader.UnreadRune(); err != nil {
		// UnreadRune reports an error when the previous operation on
		// the reader was not a ReadRune; that error is intentionally
		// dropped (best effort).
		return
	}
}

// scanHeading consumes a run of '#' characters and classifies it.
//
// The run is a HEADING when it is at most six characters long and is
// followed by whitespace or by the end of the input. Otherwise the
// characters consumed so far are returned as a WORD. The rune that ends
// the run is pushed back for the next scan.
func (s *Scanner) scanHeading() (Token, string) {
	// Deepest heading level that is still a heading.
	const maxLevel = 6

	var lit bytes.Buffer
	// Scan has already peeked this rune; consume it as the first '#'.
	lit.WriteRune(s.read())

run:
	for {
		switch c := s.read(); {
		case c == eof:
			break run
		case c == '#':
			lit.WriteRune(c)
		case unicode.IsSpace(c):
			// Whitespace ends the marker; leave it for the next token.
			s.unread()
			break run
		default:
			// '#' directly followed by text is not a heading.
			s.unread()
			return WORD, lit.String()
		}
	}

	if lit.Len() > maxLevel {
		// Too many '#' characters to be a heading.
		return WORD, lit.String()
	}
	return HEADING, lit.String()
}

// isEOL reports whether char is a line feed ('\n') or a carriage
// return ('\r').
func isEOL(char rune) bool {
	switch char {
	case '\n', '\r':
		return true
	default:
		return false
	}
}

// scanEOL consumes a run of consecutive end-of-line characters and
// returns it as a single EOL token. The first rune that is not an
// end-of-line character is pushed back for the next scan.
func (s *Scanner) scanEOL() (Token, string) {
	var lit bytes.Buffer
	// Scan has already peeked this rune; consume it.
	lit.WriteRune(s.read())

	for c := s.read(); c != eof; c = s.read() {
		if !isEOL(c) {
			// Start of the next line; leave it for the next token.
			s.unread()
			break
		}
		lit.WriteRune(c)
	}

	return EOL, lit.String()
}

// scanWord consumes runes up to, but not including, the next whitespace
// or Unicode punctuation rune (or the end of the input) and returns
// them as a WORD token. The first rune is always consumed.
//
// NOTE(review): '$', '`' and '~' are accepted by isFormatRune but are
// not Unicode punctuation, so unlike '*' and '_' they do not end a
// word. Confirm this is intended.
func (s *Scanner) scanWord() (Token, string) {
	var lit bytes.Buffer
	// Scan has already peeked this rune; consume it.
	lit.WriteRune(s.read())

	for c := s.read(); c != eof; c = s.read() {
		if unicode.IsSpace(c) || unicode.IsPunct(c) {
			// End of the word; leave the delimiter for the next token.
			s.unread()
			break
		}
		lit.WriteRune(c)
	}

	return WORD, lit.String()
}

// scanWhitespace consumes a run of whitespace and returns it as a WS
// token. The run stops before any end-of-line character, which is left
// for the next scan. The first rune is always consumed.
func (s *Scanner) scanWhitespace() (Token, string) {
	var lit bytes.Buffer
	// Scan has already peeked this rune; consume it.
	lit.WriteRune(s.read())

	for c := s.read(); c != eof; c = s.read() {
		if unicode.IsSpace(c) && !isEOL(c) {
			lit.WriteRune(c)
			continue
		}
		// Either non-whitespace or a line break: not part of this run.
		s.unread()
		break
	}

	return WS, lit.String()
}

// isFormatRune reports whether char is one of the runes that delimit a
// format span: '*', '_', '$', '`' or '~'.
func isFormatRune(char rune) bool {
	switch char {
	case '*', '_', '$', '`', '~':
		return true
	default:
		return false
	}
}

// scanFormat scans a format span: an opening format rune, any text on
// the same line, and a closing rune identical to the opening one.
//
// It returns FORMAT with the whole span, both delimiters included, when
// the closing rune is found. If the line or the input ends first, the
// span is not a valid format and the text consumed so far is returned
// as a WORD. An end-of-line character is pushed back for the next scan.
// The result therefore does not depend on whether the input happens to
// end with a newline.
func (s *Scanner) scanFormat() (Token, string) {
	// The opening rune decides which rune closes the span.
	delim := s.read()

	var lit bytes.Buffer
	lit.WriteRune(delim)

	for {
		c := s.read()
		switch {
		case c == eof:
			// Input ended before the closing rune was seen.
			return WORD, lit.String()
		case isEOL(c):
			// Format spans may not cross a line break.
			s.unread()
			return WORD, lit.String()
		}

		lit.WriteRune(c)
		if c == delim {
			// Closing rune reached; the span is complete.
			return FORMAT, lit.String()
		}
	}
}

// scanPunct consumes exactly one rune and returns it as a PUNCT token.
func (s *Scanner) scanPunct() (Token, string) {
	r := s.read()
	return PUNCT, string(r)
}

// Scan returns the next token from the input together with its literal
// text. Once no more input can be read it returns EOF and an empty
// string.
func (s *Scanner) Scan() (Token, string) {
	// Peek at the next rune: read it to choose a scan function, then
	// push it back so that function sees it as its first rune.
	next := s.read()
	s.unread()

	switch {
	case isEOL(next):
		return s.scanEOL()
	case unicode.IsSpace(next):
		return s.scanWhitespace()
	case next == '#':
		return s.scanHeading()
	case isFormatRune(next):
		return s.scanFormat()
	case unicode.IsPunct(next):
		return s.scanPunct()
	case next == eof:
		return EOF, ""
	default:
		return s.scanWord()
	}
}