Creating my own Compiler - Part 1

Some random weekend research led me to look into how a compiler works and the typical approaches. I have some interest in creating a compiler for some data models in a protobuf-like way, but not for messages.

Desired Model File Example

The goal is to largely use C-style syntax, but provide a means to specify additional attributes. I originally thought about JSON, but was inspired by some of the protobuf alternatives.

enum Enum1 {
  Unknown = 0 [display = "Unknown"];
  Uninstalled = 1;
  Installed = 2;
}

struct Type0 {
  bool active [display = "Active"];
}

struct Type1 {
  int32_t var1 [display = "Var 1", units = "Hz"];
  Type0 var2 [display = "Var 2", group = "Group 1"];
  bool[32] array1;
  int32_t bitmask [monitor = false];
}

Prototype 1 Lexer

I was looking at some open-source compilers / lexers and some Python modules, but decided to do it from scratch with minimal dependencies. I also opted for a state-machine-based approach rather than trying to use regex too much.

There is likely some optimization and general cleanup that can be done, but it seems to be working as expected. In the end, it allows iteration through the available tokens.

import sys
import enum
import re


class Lexer:
    """Hand-written, state-machine-based tokenizer for the model files.

    Typical use: call ``initialize(filename)`` once, then call ``get_token()``
    repeatedly until it returns ``None``.

    Tokens are produced as ``Token`` objects classified with ``TokenType``;
    both are defined elsewhere in this module.
    """

    # Lexer states. STATE_UNKNOWN means "no input loaded yet".
    STATE_UNKNOWN = 0
    STATE_NORMAL = 1
    STATE_QUOTED = 2
    STATE_ELEMENT = 3
    STATE_LINECOMMENT = 4
    STATE_BLOCKCOMMENT = 5

    def __init__(self):
        """Create an empty lexer; call ``initialize`` before ``get_token``."""
        self.data = None          # full text of the file being tokenized
        self.index = 0            # position of the next unread character
        self.total_length = 0     # len(self.data), cached
        self.keywords = [
            'enum',
            'struct'
        ]

        # Single characters that terminate an element and are tokens themselves.
        self.special_characters = [
            '=', '[', ']', '{', '}', ';', ','
        ]

        # Maps each special character to its token type.
        self.operators = {
            '=': TokenType.OP_ASSIGN,
            '[': TokenType.OP_LEFT_BRACKET,
            ']': TokenType.OP_RIGHT_BRACKET,
            ';': TokenType.OP_END_OF_STATEMENT,
            '{': TokenType.OP_LEFT_BRACE,
            '}': TokenType.OP_RIGHT_BRACE,
            ',': TokenType.OP_COMMA
        }

        self.state = self.STATE_UNKNOWN
        self.hex_regex = re.compile('^0[xX][0-9a-fA-F]+$')
        self.binary_regex = re.compile('^0[bB][0-1]+$')
        self.number_regex = re.compile('^\\d+\\.?\\d*$')

    def initialize(self, filename):
        """Load ``filename`` into memory and reset the lexer to its start.

        Args:
            filename: Path of the text file to tokenize.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        self.data = None
        self.state = self.STATE_UNKNOWN
        self.index = 0
        self.total_length = 0

        with open(filename) as f:
            self.data = f.read()
            self.total_length = len(self.data)
            self.state = self.STATE_NORMAL

    def get_next_character(self):
        """Return the next character and advance, or ``None`` at end of input."""
        result = None
        if self.index < self.total_length:
            result = self.data[self.index]
            self.index += 1

        return result

    def push_back_character(self):
        """Step back one character so it is read again by the next call."""
        if self.index > 0:
            self.index -= 1

    def peek_next_character(self):
        """Return the next character without consuming it, or ``None`` at end."""
        result = None
        if self.index < self.total_length:
            result = self.data[self.index]

        return result

    def get_token(self):
        """Return the next ``Token`` from the input, or ``None`` when exhausted.

        Whitespace and comments (``// ...``, ``# ...`` and ``/* ... */``) are
        skipped. Text between double quotes becomes a single CONST_STRING
        token whose text excludes the quotes.

        Raises:
            Exception: On an unterminated string literal, or if the lexer is
                in an unknown state (for example ``initialize`` was not called).
        """
        result = None
        value = ''

        while True:
            c = self.get_next_character()

            if self.state == self.STATE_QUOTED:
                if c is None:
                    # no further characters
                    raise Exception('syntax error: string missing end quote')
                if c == '"':
                    # end of string; add the token
                    self.state = self.STATE_NORMAL
                    result = Token(value, TokenType.CONST_STRING)
                    break
                value += c

            elif self.state == self.STATE_NORMAL:
                if c is None:
                    # no further characters... we are done
                    break

                next_char = self.peek_next_character()
                if c.isspace():
                    # ignore normal whitespace
                    pass
                elif c == '"':
                    # start of a string literal; the quote itself is dropped
                    self.state = self.STATE_QUOTED
                    value = ''
                elif c == '/' and next_char == '/':
                    self.state = self.STATE_LINECOMMENT
                    value = ''
                elif c == '/' and next_char == '*':
                    # Consume the '*' too, otherwise "/*/" would be taken as
                    # a complete comment (its '*' doubling as the closer).
                    self.get_next_character()
                    self.state = self.STATE_BLOCKCOMMENT
                    value = ''
                elif c == '#':
                    self.state = self.STATE_LINECOMMENT
                    value = ''
                elif c in self.special_characters:
                    result = self.evaluate_operator_value(c)
                    self.state = self.STATE_NORMAL
                    break
                else:
                    value = c
                    self.state = self.STATE_ELEMENT

            elif self.state == self.STATE_ELEMENT:
                if c is None:
                    # No further characters... end element. Reset the state so
                    # the following call reports end of input instead of
                    # trying to evaluate an empty element.
                    self.state = self.STATE_NORMAL
                    if value:
                        result = self.evaluate_token_value(value)
                    break

                if c.isspace():
                    # end element
                    self.state = self.STATE_NORMAL
                    if value:
                        result = self.evaluate_token_value(value)
                        break
                elif c in self.special_characters:
                    # end element; the special character is a token of its
                    # own, so leave it for the next call
                    self.state = self.STATE_NORMAL
                    self.push_back_character()
                    if value:
                        result = self.evaluate_token_value(value)
                        break
                else:
                    value += c

            elif self.state == self.STATE_LINECOMMENT:
                if c is None:
                    break
                if c == '\n':
                    self.state = self.STATE_NORMAL
                    value = ''

            elif self.state == self.STATE_BLOCKCOMMENT:
                if c is None:
                    # an unterminated block comment simply ends the input
                    break
                if c == '*' and self.peek_next_character() == '/':
                    self.get_next_character()
                    self.state = self.STATE_NORMAL
                    value = ''

            else:
                raise Exception('unknown state')

        return result

    def evaluate_token_value(self, value):
        """Classify the text of an element and wrap it in a ``Token``.

        Args:
            value: Non-empty element text.

        Returns:
            A KEYWORD, operator, CONST_NUMERIC or VALUE token.

        Raises:
            Exception: If ``value`` is empty.
        """
        if not value:
            raise Exception('invalid token value')

        if value in self.keywords:
            return Token(value, TokenType.KEYWORD)
        if value in self.operators:
            return Token(value, self.operators[value])
        if self.is_numeric(value):
            return Token(value, TokenType.CONST_NUMERIC)
        return Token(value, TokenType.VALUE)

    def evaluate_operator_value(self, value):
        """Wrap a special character in a ``Token`` of the matching operator type.

        Raises:
            Exception: If ``value`` is empty or is not a known operator.
        """
        if not value:
            raise Exception('invalid token')
        if value not in self.operators:
            raise Exception('invalid operator')

        return Token(value, self.operators[value])

    def is_numeric(self, value):
        """Return True if ``value`` is a decimal, hex (0x..) or binary (0b..) literal."""
        return bool(
            self.number_regex.match(value)
            or self.hex_regex.match(value)
            or self.binary_regex.match(value)
        )


class Token:
    """A single lexical token: its source text plus its classification."""

    def __init__(self, token_text, token_type):
        """Store the token's text and kind.

        Args:
            token_text: The token's actual text. Used for identifiers,
                strings, and numbers.
            token_type: The TokenType that this token is classified as.
        """
        self.text = token_text
        self.kind = token_type

    def __repr__(self):
        """Return a debugging representation showing both fields."""
        return f'Token(text={self.text!r}, kind={self.kind!r})'


# TokenType is our enum for all the types of tokens.
class TokenType(enum.Enum):
    """Classification of a token produced by the lexer.

    Values below 100 are general token categories; values from 100 up are
    the single-character operators / punctuation.
    """

    EOF = -1
    KEYWORD = 0
    CONST_STRING = 1
    CONST_NUMERIC = 2
    VALUE = 3
    OP_ASSIGN = 100
    OP_LEFT_BRACKET = 101
    OP_RIGHT_BRACKET = 102
    OP_END_OF_STATEMENT = 103
    # No trailing comma here: "104," would make the value the tuple (104,).
    OP_LEFT_BRACE = 104
    OP_RIGHT_BRACE = 105
    OP_COMMA = 106

Test Application

from generator import Lexer


def cli_main(filename=None):
    """Tokenize a model file and print every token, one per line.

    Args:
        filename: Path of the model file to tokenize. When omitted, the first
            command-line argument is used; if there is none, the original
            sample path is used as a fallback.
    """
    # Imported locally because this script does not import sys at file level.
    import sys

    if filename is None:
        if len(sys.argv) > 1:
            filename = sys.argv[1]
        else:
            filename = "D:\\Projects\\lexer\\test1.dmod"

    lexer = Lexer()
    lexer.initialize(filename)

    # get_token() returns None once the input is exhausted.
    while True:
        token = lexer.get_token()
        if not token:
            break
        print(f'text = "{token.text}", kind = "{token.kind}"')


if __name__ == '__main__':
    cli_main()

Text Output

text = "enum", kind = "TokenType.KEYWORD"
text = "Enum1", kind = "TokenType.VALUE"
text = "{", kind = "TokenType.OP_LEFT_BRACE"
text = "Unknown", kind = "TokenType.VALUE"
text = "=", kind = "TokenType.OP_ASSIGN"
text = "0", kind = "TokenType.CONST_NUMERIC"
text = "[", kind = "TokenType.OP_LEFT_BRACKET"
text = "display", kind = "TokenType.VALUE"
text = "=", kind = "TokenType.OP_ASSIGN"
text = ""Unknown"", kind = "TokenType.VALUE"
text = "]", kind = "TokenType.OP_RIGHT_BRACKET"
text = ";", kind = "TokenType.OP_END_OF_STATEMENT"
text = "Uninstalled", kind = "TokenType.VALUE"
text = "=", kind = "TokenType.OP_ASSIGN"
text = "1", kind = "TokenType.CONST_NUMERIC"
text = ";", kind = "TokenType.OP_END_OF_STATEMENT"
text = "Installed", kind = "TokenType.VALUE"
text = "=", kind = "TokenType.OP_ASSIGN"
text = "2", kind = "TokenType.CONST_NUMERIC"
text = ";", kind = "TokenType.OP_END_OF_STATEMENT"
text = "}", kind = "TokenType.OP_RIGHT_BRACE"
text = "struct", kind = "TokenType.KEYWORD"
text = "Type0", kind = "TokenType.VALUE"
text = "{", kind = "TokenType.OP_LEFT_BRACE"
text = "bool", kind = "TokenType.VALUE"
text = "active", kind = "TokenType.VALUE"
text = "[", kind = "TokenType.OP_LEFT_BRACKET"
text = "display", kind = "TokenType.VALUE"
text = "=", kind = "TokenType.OP_ASSIGN"
text = ""Active"", kind = "TokenType.VALUE"
text = "]", kind = "TokenType.OP_RIGHT_BRACKET"
text = ";", kind = "TokenType.OP_END_OF_STATEMENT"
text = "}", kind = "TokenType.OP_RIGHT_BRACE"
text = "struct", kind = "TokenType.KEYWORD"
text = "Type1", kind = "TokenType.VALUE"
text = "{", kind = "TokenType.OP_LEFT_BRACE"
text = "int32_t", kind = "TokenType.VALUE"
text = "var1", kind = "TokenType.VALUE"
text = "[", kind = "TokenType.OP_LEFT_BRACKET"
text = "display", kind = "TokenType.VALUE"
text = "=", kind = "TokenType.OP_ASSIGN"
text = ""Var", kind = "TokenType.VALUE"
text = "1"", kind = "TokenType.VALUE"
text = ",", kind = "TokenType.OP_COMMA"
text = "units", kind = "TokenType.VALUE"
text = "=", kind = "TokenType.OP_ASSIGN"
text = ""Hz"", kind = "TokenType.VALUE"
text = "]", kind = "TokenType.OP_RIGHT_BRACKET"
text = ";", kind = "TokenType.OP_END_OF_STATEMENT"
text = "Type0", kind = "TokenType.VALUE"
text = "var2", kind = "TokenType.VALUE"
text = "[", kind = "TokenType.OP_LEFT_BRACKET"
text = "display", kind = "TokenType.VALUE"
text = "=", kind = "TokenType.OP_ASSIGN"
text = ""Var", kind = "TokenType.VALUE"
text = "2"", kind = "TokenType.VALUE"
text = ",", kind = "TokenType.OP_COMMA"
text = "group", kind = "TokenType.VALUE"
text = "=", kind = "TokenType.OP_ASSIGN"
text = ""Group", kind = "TokenType.VALUE"
text = "1"", kind = "TokenType.VALUE"
text = "]", kind = "TokenType.OP_RIGHT_BRACKET"
text = ";", kind = "TokenType.OP_END_OF_STATEMENT"
text = "bool", kind = "TokenType.VALUE"
text = "[", kind = "TokenType.OP_LEFT_BRACKET"
text = "32", kind = "TokenType.CONST_NUMERIC"
text = "]", kind = "TokenType.OP_RIGHT_BRACKET"
text = "array1", kind = "TokenType.VALUE"
text = ";", kind = "TokenType.OP_END_OF_STATEMENT"
text = "int32_t", kind = "TokenType.VALUE"
text = "bitmask", kind = "TokenType.VALUE"
text = "[", kind = "TokenType.OP_LEFT_BRACKET"
text = "monitor", kind = "TokenType.VALUE"
text = "=", kind = "TokenType.OP_ASSIGN"
text = "false", kind = "TokenType.VALUE"
text = "]", kind = "TokenType.OP_RIGHT_BRACKET"
text = ";", kind = "TokenType.OP_END_OF_STATEMENT"
text = "}", kind = "TokenType.OP_RIGHT_BRACE"

Process finished with exit code 0

All the code may be found here: https://github.com/ericjameszimmerman/simplecompiler