amu/scanner/scanner.go

/*
   AMU: Custom simple markup language
   Copyright (C) 2021 Arsen Musayelyan

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.
*/

// Package scanner provides a scanner and tokenizer for AMU source code
package scanner

import (
	"bufio"
	"bytes"
	"io"
	"unicode"
)

// Token identifies the lexical class of a piece of scanned AMU source.
type Token int

// Token classes returned by Scanner.Scan. Values are assigned in
// declaration order, starting at zero with EOF.
const (
	// EOF signals that no more input is available.
	EOF Token = iota
	// EOL is a run of one or more '\n' / '\r' characters.
	EOL
	// WS is a run of whitespace that contains no line endings.
	WS
	// WORD is plain text. It is also used as the fallback for heading
	// and format candidates that turn out to be invalid.
	WORD
	// HEADING is a run of one to six '#' characters.
	HEADING
	// FORMAT is text enclosed by a pair of identical format runes.
	FORMAT
	// PUNCT is a single punctuation rune.
	PUNCT
)

// eof is the sentinel rune that read returns when the underlying reader
// reports an error, io.EOF included. Because the sentinel is the NUL rune,
// a literal NUL in the input is indistinguishable from the end of input.
var eof rune = 0

// String returns the upper-case name of the token class, or "unknown" for
// a value that does not correspond to any declared class.
func (t Token) String() string {
	names := [...]string{
		EOF:     "EOF",
		EOL:     "EOL",
		WS:      "WS",
		WORD:    "WORD",
		HEADING: "HEADING",
		FORMAT:  "FORMAT",
		PUNCT:   "PUNCT",
	}
	// Guard both ends so negative and too-large values fall back to
	// "unknown" instead of panicking on the array index.
	if t < 0 || int(t) >= len(names) {
		return "unknown"
	}
	return names[t]
}

// Scanner is a lexer for AMU source code. It pulls runes from a buffered
// reader and groups them into tokens.
type Scanner struct {
	// reader is the buffered source of runes; buffering is what allows
	// the most recently read rune to be pushed back.
	reader *bufio.Reader
}

// New returns a Scanner that tokenizes the data read from r.
//
// A *bufio.Reader is used as-is and the Reader half of a *bufio.ReadWriter
// is reused, so existing buffers are not wrapped a second time. Any other
// reader is wrapped in a fresh bufio.Reader.
func New(r io.Reader) *Scanner {
	var buffered *bufio.Reader
	switch src := r.(type) {
	case *bufio.Reader:
		buffered = src
	case *bufio.ReadWriter:
		buffered = src.Reader
	default:
		buffered = bufio.NewReader(r)
	}
	return &Scanner{reader: buffered}
}

// read returns the next rune from the underlying bufio.Reader.
// Any read error, including io.EOF, is reported as the eof sentinel.
func (s *Scanner) read() rune {
	if char, _, err := s.reader.ReadRune(); err == nil {
		return char
	}
	return eof
}

// unread pushes the most recently read rune back onto the underlying
// bufio.Reader so that the next call to read returns it again.
func (s *Scanner) unread() {
	if err := s.reader.UnreadRune(); err != nil {
		// Deliberately ignored: Scan calls unread right after read even
		// when read reported eof, in which case there is no rune to push
		// back and UnreadRune fails. Nothing needs to be restored then.
		return
	}
}

// scanHeading consumes a run of '#' characters and classifies it.
//
// The run is a HEADING when it is followed by whitespace or the end of
// input and is at most six characters long. Otherwise the characters
// consumed so far are returned as a WORD.
func (s *Scanner) scanHeading() (Token, string) {
	var lit bytes.Buffer
	// Scan only dispatches here after peeking a '#', so the first rune
	// is taken without further checks.
	lit.WriteRune(s.read())

run:
	for {
		switch next := s.read(); {
		case next == eof:
			break run
		case next == '#':
			lit.WriteRune(next)
		case unicode.IsSpace(next):
			// Whitespace terminates the marker; leave it for the next token.
			s.unread()
			break run
		default:
			// Text directly after the '#' run means this is not a heading.
			s.unread()
			return WORD, lit.String()
		}
	}

	// Only '#' runes are written after the first one, and Scan dispatches
	// here on '#', so the byte length is the heading level.
	if lit.Len() <= 6 {
		return HEADING, lit.String()
	}
	// More than six levels is not a valid heading.
	return WORD, lit.String()
}

// isEOL reports whether char is a line-ending character, that is a
// line feed or a carriage return.
func isEOL(char rune) bool {
	switch char {
	case '\n', '\r':
		return true
	}
	return false
}

// scanEOL consumes a run of line-ending characters and returns it as a
// single EOL token. Consecutive line endings are therefore folded into one
// token whose literal holds every consumed character.
func (s *Scanner) scanEOL() (Token, string) {
	var lit bytes.Buffer
	// Scan only dispatches here after peeking a line ending.
	lit.WriteRune(s.read())

	for next := s.read(); next != eof; next = s.read() {
		if !isEOL(next) {
			// First rune of the next line; leave it for the next token.
			s.unread()
			break
		}
		lit.WriteRune(next)
	}

	return EOL, lit.String()
}

// scanWord consumes a run of text and returns it as a WORD token.
//
// The word ends at the end of input or just before the first whitespace,
// punctuation or format rune. Format runes are checked explicitly because
// '$', '`' and '~' are Unicode symbols rather than punctuation, so
// unicode.IsPunct alone would let them be absorbed into the word, and a
// format opened directly after word characters would never be tokenized
// (unlike one opened with '*' or '_').
func (s *Scanner) scanWord() (Token, string) {
	var lit bytes.Buffer
	// Scan only dispatches here after peeking a rune that starts a word.
	lit.WriteRune(s.read())

	for {
		next := s.read()
		if next == eof {
			break
		}
		if unicode.IsSpace(next) || unicode.IsPunct(next) || isFormatRune(next) {
			// The terminator belongs to the next token.
			s.unread()
			break
		}
		lit.WriteRune(next)
	}

	return WORD, lit.String()
}

// scanWhitespace consumes a run of whitespace and returns it as a WS token.
// Line endings are not part of the run; they are left for scanEOL.
func (s *Scanner) scanWhitespace() (Token, string) {
	var lit bytes.Buffer
	// Scan only dispatches here after peeking a non-EOL whitespace rune.
	lit.WriteRune(s.read())

	for next := s.read(); next != eof; next = s.read() {
		if isEOL(next) || !unicode.IsSpace(next) {
			// End of the whitespace run; leave the rune for the next token.
			s.unread()
			break
		}
		lit.WriteRune(next)
	}

	return WS, lit.String()
}

// isFormatRune reports whether char is one of the runes that delimit a
// FORMAT token: '*', '_', '$', '`' or '~'.
func isFormatRune(char rune) bool {
	switch char {
	case '*', '_', '$', '`', '~':
		return true
	default:
		return false
	}
}

// scanFormat consumes text enclosed by a pair of identical format runes.
//
// The first rune read is the opening delimiter. Scanning continues until
// the same rune is seen again, and the literal, delimiters included, is
// returned as a FORMAT token. If a line ending or the end of input is
// reached first, the format is unterminated and everything consumed so far
// is returned as a WORD token instead. A line ending is pushed back so
// that it is scanned as its own EOL token.
func (s *Scanner) scanFormat() (Token, string) {
	// Remember the opening format rune; the same rune closes the format.
	delim := s.read()
	var lit bytes.Buffer
	lit.WriteRune(delim)

	for {
		next := s.read()
		switch {
		case next == eof:
			// Input ended before the closing delimiter. Treat this like
			// an unterminated format on a line: it is plain text.
			return WORD, lit.String()
		case isEOL(next):
			// Formats may not span lines.
			s.unread()
			return WORD, lit.String()
		}

		lit.WriteRune(next)
		if next == delim {
			// Closing delimiter written; the format is complete.
			return FORMAT, lit.String()
		}
	}
}

// scanPunct consumes exactly one rune and returns it as a PUNCT token.
func (s *Scanner) scanPunct() (Token, string) {
	punct := s.read()
	return PUNCT, string(punct)
}

// Scan returns the next token from the input together with its literal
// text. At the end of input it returns EOF and an empty literal.
//
// The class is chosen from the first rune: line endings, other whitespace,
// '#', format runes and punctuation are checked in that order, and any
// remaining rune starts a word.
func (s *Scanner) Scan() (Token, string) {
	// Peek at the next rune: read it, then push it back so the chosen
	// scan function sees it as its first rune.
	next := s.read()
	s.unread()

	switch {
	case next == eof:
		// eof is the NUL rune, which matches none of the classes below,
		// so testing it first does not change the dispatch.
		return EOF, ""
	case isEOL(next):
		return s.scanEOL()
	case unicode.IsSpace(next):
		return s.scanWhitespace()
	case next == '#':
		return s.scanHeading()
	case isFormatRune(next):
		return s.scanFormat()
	case unicode.IsPunct(next):
		return s.scanPunct()
	default:
		return s.scanWord()
	}
}
Initial Commit 2021-10-02 22:12:57 +00:00			`/*`
			`AMU: Custom simple markup language`
			`Copyright (C) 2021 Arsen Musayelyan`

			`This program is free software: you can redistribute it and/or modify`
			`it under the terms of the GNU General Public License as published by`
			`the Free Software Foundation, either version 3 of the License, or`
			`(at your option) any later version.`

			`This program is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`GNU General Public License for more details.`

			`You should have received a copy of the GNU General Public License`
			`along with this program. If not, see <https://www.gnu.org/licenses/>.`
			`*/`

			`// Package scanner provides a scanner and tokenizer for AMU source code`
			`package scanner`

			`import (`
			`"bufio"`
			`"bytes"`
			`"io"`
			`"unicode"`
			`)`

			`// Token represents a lexer token`
			`type Token int`

			`const (`
			`EOF Token = iota`
			`EOL`
			`WS`
			`WORD`
			`HEADING`
			`FORMAT`
			`PUNCT`
			`)`

			`var eof rune = 0`

			`// String converts a token into a string.`
			`func (t Token) String() string {`
			`switch t {`
			`case EOF:`
			`return "EOF"`
			`case EOL:`
			`return "EOL"`
			`case WS:`
			`return "WS"`
			`case WORD:`
			`return "WORD"`
			`case HEADING:`
			`return "HEADING"`
			`case FORMAT:`
			`return "FORMAT"`
			`case PUNCT:`
			`return "PUNCT"`
			`default:`
			`return "unknown"`
			`}`
			`}`

			`// Scanner implements a lexer for AMU source code.`
			`type Scanner struct {`
			`reader *bufio.Reader`
			`}`

			`// New creates a new Scanner.`
			`func New(r io.Reader) *Scanner {`

			`switch r := r.(type) {`
			`case *bufio.Reader:`
			`return &Scanner{reader: r}`
			`case *bufio.ReadWriter:`
			`return &Scanner{reader: r.Reader}`
			`default:`
			`return &Scanner{reader: bufio.NewReader(r)}`
			`}`
			`}`

			`// read reads a single rule from the underlying bufio.Reader`
			`func (s *Scanner) read() rune {`
			`// Read rune from reader`
			`char, _, err := s.reader.ReadRune()`
			`if err != nil {`
			`return eof`
			`}`
			`return char`
			`}`

			`// unread unreads the last read rune from`
			`// the underlying bufio.Reader`
			`func (s *Scanner) unread() {`
			`_ = s.reader.UnreadRune()`
			`}`

			`// scanHeading attempts to scan a HEADING token`
			`func (s *Scanner) scanHeading() (Token, string) {`

			`// Create new buffer for token literal`
			`buf := &bytes.Buffer{}`
			`// Write first character to buffer`
			`buf.WriteRune(s.read())`

			`for {`
			`// Read character`
			`char := s.read()`
			`if char == eof {`
			`break`
			`} else if char != '#' && !unicode.IsSpace(char) {`
			`// Unread character as this is not a valid heading`
			`s.unread()`
			`// Return literal as a WORD token`
			`return WORD, buf.String()`
			`} else if char != '#' {`
			`// Unread character as this is the end of the heading literal`
			`s.unread()`
			`break`
			`} else {`
			`// Write character to buffer`
			`buf.WriteRune(char)`
			`}`
			`}`

			`// If level more than 6`
			`if buf.Len() > 6 {`
			`// Return literal as a WORD token as this is not a valid heading`
			`return WORD, buf.String()`
			`}`

			`// Return HEADING token`
			`return HEADING, buf.String()`
			`}`

			`// isEOL checks if char is an end of line character`
			`func isEOL(char rune) bool {`
			`return char == '\n' \|\| char == '\r'`
			`}`

			`// scanEOL scans an EOL token`
			`func (s *Scanner) scanEOL() (Token, string) {`
			`// Create new buffer for token literal`
			`buf := &bytes.Buffer{}`
			`// Write first character to buffer`
			`buf.WriteRune(s.read())`

			`for {`
			`// Read character`
			`char := s.read()`
			`if char == eof {`
			`break`
			`} else if !isEOL(char) {`
			`// Unread character as this is the beginning of the next line`
			`s.unread()`
			`break`
			`} else {`
			`// Write character to buffer`
			`buf.WriteRune(char)`
			`}`
			`}`

			`// Return EOL token`
			`return EOL, buf.String()`
			`}`

			`// scanWord scans a WORD token`
			`func (s *Scanner) scanWord() (Token, string) {`
			`// Create new buffer for token literal`
			`buf := &bytes.Buffer{}`
			`// Write first character to buffer`
			`buf.WriteRune(s.read())`

			`for {`
			`// Read character`
			`char := s.read()`
			`if char == eof {`
			`break`
			`} else if unicode.IsSpace(char) \|\| unicode.IsPunct(char) {`
			`// Unread as this is the end of the word`
			`s.unread()`
			`break`
			`}`
			`// Write character to buffer`
			`buf.WriteRune(char)`
			`}`

			`// Return WORD token`
			`return WORD, buf.String()`
			`}`

			`// scanWhitespace scans a WS token`
			`func (s *Scanner) scanWhitespace() (Token, string) {`
			`// Create new buffer for token literal`
			`buf := &bytes.Buffer{}`
			`// Write first character to the buffer`
			`buf.WriteRune(s.read())`

			`for {`
			`// Read character`
			`char := s.read()`
			`if char == eof {`
			`break`
			`} else if !unicode.IsSpace(char) \|\| isEOL(char) {`
			`// Unread as this is the end of the whitespace`
			`s.unread()`
			`break`
			`} else {`
			`// Write character to buffer`
			`buf.WriteRune(char)`
			`}`
			`}`

			`// Return WS token`
			`return WS, buf.String()`
			`}`

			`// isFormatRune checks whether char is a`
			`// format character`
			`func isFormatRune(char rune) bool {`
			`return char == '*' \|\|`
			`char == '_' \|\|`
			`char == '$' \|\|`
			char == '`' \|\|
			`char == '~'`
			`}`

			`// scanFormat scans a FORMAT token`
			`func (s *Scanner) scanFormat() (Token, string) {`
			`// Store format rule for use later`
			`formatRune := s.read()`
			`// Create new buffer for token literal`
			`buf := &bytes.Buffer{}`
			`// Write first character to buffer`
			`buf.WriteRune(formatRune)`

			`for {`
			`// Read character`
			`char := s.read()`
			`if char == eof {`
			`break`
			`} else if isEOL(char) {`
			`// Unread as this is not a valid format`
			`s.unread()`
			`// Return literal as WORD token`
			`return WORD, buf.String()`
			`} else if char == formatRune {`
			`// Write character to buffer`
			`buf.WriteRune(char)`
			`// Stop scanning as this is the end of the format`
			`break`
			`} else {`
			`// Write character to buffer`
			`buf.WriteRune(char)`
			`}`
			`}`

			`// Return FORMAt token`
			`return FORMAT, buf.String()`
			`}`

			`// scanPunct scans a PUNCT token`
			`func (s *Scanner) scanPunct() (Token, string) {`
			`return PUNCT, string(s.read())`
			`}`

			`// Scan scans a single token from the input`
			`func (s *Scanner) Scan() (Token, string) {`
			`// read character`
			`char := s.read()`

			`// Unread character as it will be`
			`// needed by future functions`
			`s.unread()`

			`// Run appropriate scan function and return result`
			`if isEOL(char) {`
			`return s.scanEOL()`
			`} else if unicode.IsSpace(char) {`
			`return s.scanWhitespace()`
			`} else if char == '#' {`
			`return s.scanHeading()`
			`} else if isFormatRune(char) {`
			`return s.scanFormat()`
			`} else if unicode.IsPunct(char) {`
			`return s.scanPunct()`
			`} else if char != eof {`
			`return s.scanWord()`
			`} else {`
			`return EOF, ""`
			`}`
			`}`