
Let's write a compiler

~25 mins read

Both an interpreter and a compiler translate one language into another, normally a higher-level one into a lower-level one; the difference is that a compiler writes the translation out as a new program, while an interpreter performs it on the fly as it executes.

In the end, every program must be compiled down to assembly, then to CPU instructions, and finally to electrical pulses on the transistors of logic gates.

In the An interpreter in Python post, we built a mini interpreter that visits the AST nodes and evaluates them. The flow looked like:

Source Code -> TOKENIZER -> tokens -> PARSER -> Abstract Syntax Tree -> EVALUATOR -> executed by the host language (e.g. C) -> host COMPILER -> machine code

For a compiler, it looks like:

Source Code -> TOKENIZER -> tokens -> PARSER -> Abstract Syntax Tree -> COMPILER -> bytecode -> VM -> COMPILER for the VM's language -> Assembler -> machine code
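To make that target concrete, here is a rough sketch (my own illustration, not code we have written yet) of what the COMPILER stage could emit for 1 + 2, assuming a tiny stack-based instruction set with made-up opcode names:

# Hypothetical bytecode for `1 + 2` on a stack-based VM.
# The opcode names (OpConstant, OpAdd) are illustrative only.
constants = [1, 2]            # constant pool, referenced by index
instructions = [
    ("OpConstant", 0),        # push constants[0] (i.e. 1)
    ("OpConstant", 1),        # push constants[1] (i.e. 2)
    ("OpAdd",),               # pop both, push the sum
]

The VM would simply loop over instructions with a stack; everything before it (tokenizer, parser, compiler) exists to produce that flat list. As always, we start with the tokens.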

from enum import Enum, auto
from dataclasses import dataclass

# Enum to represent different token types
class TokenType(Enum):
    ILLEGAL = "ILLEGAL"
    EOF = "EOF"

    # Identifiers + literals
    IDENT = "IDENT"  # add, foobar, x, y, ...
    INT = "INT"      # 1343456
    STRING = "STRING"  # "foobar"

    # Operators
    ASSIGN = "="
    PLUS = "+"
    MINUS = "-"
    BANG = "!"
    ASTERISK = "*"
    SLASH = "/"

    LT = "<"
    GT = ">"

    EQ = "=="
    NOT_EQ = "!="

    # Delimiters
    COMMA = ","
    SEMICOLON = ";"
    COLON = ":"

    LPAREN = "("
    RPAREN = ")"
    LBRACE = "{"
    RBRACE = "}"
    LBRACKET = "["
    RBRACKET = "]"

    # Keywords
    FUNCTION = "FUNCTION"
    LET = "LET"
    TRUE = "TRUE"
    FALSE = "FALSE"
    IF = "IF"
    ELSE = "ELSE"
    RETURN = "RETURN"

@dataclass
class Token:
    type: TokenType
    literal: str

keywords = {
    "fn": TokenType.FUNCTION,
    "let": TokenType.LET,
    "true": TokenType.TRUE,
    "false": TokenType.FALSE,
    "if": TokenType.IF,
    "else": TokenType.ELSE,
    "return": TokenType.RETURN,
}

single_char_tokens = {
    '=': TokenType.ASSIGN,
    '+': TokenType.PLUS,
    '-': TokenType.MINUS,
    '!': TokenType.BANG,
    '/': TokenType.SLASH,
    '*': TokenType.ASTERISK,
    '<': TokenType.LT,
    '>': TokenType.GT,
    ';': TokenType.SEMICOLON,
    ':': TokenType.COLON,
    ',': TokenType.COMMA,
    '{': TokenType.LBRACE,
    '}': TokenType.RBRACE,
    '(': TokenType.LPAREN,
    ')': TokenType.RPAREN,
    '[': TokenType.LBRACKET,
    ']': TokenType.RBRACKET,
}

def lookup_ident(ident: str) -> TokenType:
    return keywords.get(ident, TokenType.IDENT)
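A quick throwaway check of lookup_ident (not part of token.py): keywords resolve to their own token types, everything else falls back to IDENT.

assert lookup_ident("fn") == TokenType.FUNCTION
assert lookup_ident("return") == TokenType.RETURN
assert lookup_ident("add") == TokenType.IDENT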

Now that we have the tokens (the imports below assume this file is saved as token.py), let's define what we expect from the tokenizer:

from tokenizer import Tokenizer
from token import TokenType

def test_next_token():
    input_text = """
    let five = 5;
    let ten = 10;

    let add = fn(x, y) {
        x + y;
    };

    let result = add(five, ten);
    !-/*5;
    5 < 10 > 5;

    if (5 < 10) {
        return true;
    } else {
        return false;
    }

    10 == 10;
    10 != 9;
    "foobar"
    "foo bar"
    [1, 2];
    {"foo": "bar"}
    """

    tests = [
        (TokenType.LET, "let"),
        (TokenType.IDENT, "five"),
        (TokenType.ASSIGN, "="),
        (TokenType.INT, "5"),
        (TokenType.SEMICOLON, ";"),
        (TokenType.LET, "let"),
        (TokenType.IDENT, "ten"),
        (TokenType.ASSIGN, "="),
        (TokenType.INT, "10"),
        (TokenType.SEMICOLON, ";"),
        (TokenType.LET, "let"),
        (TokenType.IDENT, "add"),
        (TokenType.ASSIGN, "="),
        (TokenType.FUNCTION, "fn"),
        (TokenType.LPAREN, "("),
        (TokenType.IDENT, "x"),
        (TokenType.COMMA, ","),
        (TokenType.IDENT, "y"),
        (TokenType.RPAREN, ")"),
        (TokenType.LBRACE, "{"),
        (TokenType.IDENT, "x"),
        (TokenType.PLUS, "+"),
        (TokenType.IDENT, "y"),
        (TokenType.SEMICOLON, ";"),
        (TokenType.RBRACE, "}"),
        (TokenType.SEMICOLON, ";"),
        (TokenType.LET, "let"),
        (TokenType.IDENT, "result"),
        (TokenType.ASSIGN, "="),
        (TokenType.IDENT, "add"),
        (TokenType.LPAREN, "("),
        (TokenType.IDENT, "five"),
        (TokenType.COMMA, ","),
        (TokenType.IDENT, "ten"),
        (TokenType.RPAREN, ")"),
        (TokenType.SEMICOLON, ";"),
        (TokenType.BANG, "!"),
        (TokenType.MINUS, "-"),
        (TokenType.SLASH, "/"),
        (TokenType.ASTERISK, "*"),
        (TokenType.INT, "5"),
        (TokenType.SEMICOLON, ";"),
        (TokenType.INT, "5"),
        (TokenType.LT, "<"),
        (TokenType.INT, "10"),
        (TokenType.GT, ">"),
        (TokenType.INT, "5"),
        (TokenType.SEMICOLON, ";"),
        (TokenType.IF, "if"),
        (TokenType.LPAREN, "("),
        (TokenType.INT, "5"),
        (TokenType.LT, "<"),
        (TokenType.INT, "10"),
        (TokenType.RPAREN, ")"),
        (TokenType.LBRACE, "{"),
        (TokenType.RETURN, "return"),
        (TokenType.TRUE, "true"),
        (TokenType.SEMICOLON, ";"),
        (TokenType.RBRACE, "}"),
        (TokenType.ELSE, "else"),
        (TokenType.LBRACE, "{"),
        (TokenType.RETURN, "return"),
        (TokenType.FALSE, "false"),
        (TokenType.SEMICOLON, ";"),
        (TokenType.RBRACE, "}"),
        (TokenType.INT, "10"),
        (TokenType.EQ, "=="),
        (TokenType.INT, "10"),
        (TokenType.SEMICOLON, ";"),
        (TokenType.INT, "10"),
        (TokenType.NOT_EQ, "!="),
        (TokenType.INT, "9"),
        (TokenType.SEMICOLON, ";"),
        (TokenType.STRING, "foobar"),
        (TokenType.STRING, "foo bar"),
        (TokenType.LBRACKET, "["),
        (TokenType.INT, "1"),
        (TokenType.COMMA, ","),
        (TokenType.INT, "2"),
        (TokenType.RBRACKET, "]"),
        (TokenType.SEMICOLON, ";"),
        (TokenType.LBRACE, "{"),
        (TokenType.STRING, "foo"),
        (TokenType.COLON, ":"),
        (TokenType.STRING, "bar"),
        (TokenType.RBRACE, "}"),
        (TokenType.EOF, ""),
    ]

    tokenizer = Tokenizer(input_text)

    for i, (expected_type, expected_literal) in enumerate(tests):
        tok = tokenizer.next_token()

        assert tok.type == expected_type, f"tests[{i}] - tokentype wrong. expected={expected_type}, got={tok.type}"
        assert tok.literal == expected_literal, f"tests[{i}] - literal wrong. expected={expected_literal}, got={tok.literal}"

    print("Tokenizer tests passed!")

if __name__ == "__main__":
    test_next_token()

Now let's implement the tokenizer itself, in tokenizer.py:

from token import Token, TokenType, single_char_tokens, lookup_ident
from typing import Optional

class Tokenizer:
    def __init__(self, input: str):
        self.input = input
        self.position = 0       # position of the current character
        self.read_position = 0  # position of the next character to read
        self.ch: Optional[str] = None
        self.read_char()        # load the first character

    def read_char(self):
        if self.read_position >= len(self.input):
            self.ch = None
        else:
            self.ch = self.input[self.read_position]
        self.position = self.read_position
        self.read_position += 1

    def peek_char(self):
        if self.read_position >= len(self.input):
            return None
        else:
            return self.input[self.read_position]

    def read_identifier(self):
        position = self.position
        while self.ch is not None and (self.ch.isalpha() or self.ch == '_'):
            self.read_char()
        return self.input[position:self.position]

    def read_number(self):
        position = self.position
        while self.ch is not None and self.ch.isdigit():
            self.read_char()
        return self.input[position:self.position]

    def read_string(self):
        # self.ch is the opening quote; collect everything up to the closing quote (or EOF)
        position = self.position + 1
        while True:
            self.read_char()
            if self.ch == '"' or self.ch is None:
                break
        return self.input[position:self.position]

    def skip_whitespace(self):
        while self.ch is not None and self.ch.isspace():
            self.read_char()


    def next_token(self):
        self.skip_whitespace()

        if self.ch is None:
            return Token(TokenType.EOF, "")

        # Two-character operators must be checked before '=' and '!'
        if self.ch == '=' and self.peek_char() == '=':
            self.read_char()
            self.read_char()
            return Token(TokenType.EQ, "==")

        if self.ch == '!' and self.peek_char() == '=':
            self.read_char()
            self.read_char()
            return Token(TokenType.NOT_EQ, "!=")

        if self.ch in single_char_tokens:
            tok_type = single_char_tokens[self.ch]
            tok_literal = self.ch
            self.read_char()
            return Token(tok_type, tok_literal)

        if self.ch == '"':
            tok_literal = self.read_string()
            self.read_char()  # consume the closing quote
            return Token(TokenType.STRING, tok_literal)

        if self.ch.isdigit():
            return Token(TokenType.INT, self.read_number())

        if self.ch.isalpha() or self.ch == '_':
            tok_literal = self.read_identifier()
            return Token(lookup_ident(tok_literal), tok_literal)

        ch = self.ch
        self.read_char()
        return Token(TokenType.ILLEGAL, ch)
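Before moving on, a quick smoke test from the REPL (a throwaway snippet, not part of the files above):

from tokenizer import Tokenizer
from token import TokenType

t = Tokenizer("let x = 5;")
tok = t.next_token()
while tok.type != TokenType.EOF:
    print(tok)
    tok = t.next_token()
# Token(type=<TokenType.LET: 'LET'>, literal='let')
# Token(type=<TokenType.IDENT: 'IDENT'>, literal='x')
# Token(type=<TokenType.ASSIGN: '='>, literal='=')
# Token(type=<TokenType.INT: 'INT'>, literal='5')
# Token(type=<TokenType.SEMICOLON: ';'>, literal=';')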

Now let's define the AST:

a program is a list of statements

each statement is a node in the AST

from token import Token, TokenType
from typing import List, Optional, Dict
from dataclasses import dataclass

# The base Node interface
@dataclass
class Node:
    def token_literal(self) -> str:
        raise NotImplementedError

    def __str__(self) -> str:
        raise NotImplementedError

# All statement nodes implement this
@dataclass
class Statement(Node):
    def token_literal(self) -> str:
        raise NotImplementedError

# All expression nodes implement this
@dataclass
class Expression(Node):
    def token_literal(self) -> str:
        raise NotImplementedError

@dataclass
class Program(Node):
    statements: List[Statement]

    def token_literal(self) -> str:
        if self.statements:
            return self.statements[0].token_literal()
        return ""

    def __str__(self) -> str:
        return "".join(str(stmt) for stmt in self.statements)

@dataclass
class LetStatement(Statement):
    token: Token
    name: Expression
    value: Expression

    def token_literal(self) -> str:
        return self.token.literal

    def __str__(self) -> str:
        return f"{self.token_literal()} {self.name} = {self.value};"

@dataclass
class ReturnStatement(Statement):
    token: Token
    return_value: Optional[Expression]

    def token_literal(self) -> str:
        return self.token.literal

    def __str__(self) -> str:
        if self.return_value is not None:
            return f"{self.token_literal()} {self.return_value};"
        return f"{self.token_literal()};"

@dataclass
class ExpressionStatement(Statement):
    token: Token
    expression: Optional[Expression]

    def token_literal(self) -> str:
        return self.token.literal

    def __str__(self) -> str:
        if self.expression is not None:
            return str(self.expression)
        return ""

@dataclass
class BlockStatement(Statement):
    token: Token
    statements: List[Statement]

    def token_literal(self) -> str:
        return self.token.literal

    def __str__(self) -> str:
        return "".join(str(stmt) for stmt in self.statements)

@dataclass
class Identifier(Expression):
    token: Token
    value: str

    def token_literal(self) -> str:
        return self.token.literal

    def __str__(self) -> str:
        return self.value

@dataclass
class Boolean(Expression):
    token: Token
    value: bool

    def token_literal(self) -> str:
        return self.token.literal

    def __str__(self) -> str:
        return self.token.literal

@dataclass
class IntegerLiteral(Expression):
    token: Token
    value: int

    def token_literal(self) -> str:
        return self.token.literal

    def __str__(self) -> str:
        return self.token.literal

@dataclass
class PrefixExpression(Expression):
    token: Token
    operator: str
    right: Expression

    def token_literal(self) -> str:
        return self.token.literal

    def __str__(self) -> str:
        return f"({self.operator}{self.right})"

@dataclass
class InfixExpression(Expression):
    token: Token
    left: Expression
    operator: str
    right: Expression

    def token_literal(self) -> str:
        return self.token.literal

    def __str__(self) -> str:
        return f"({self.left} {self.operator} {self.right})"

@dataclass
class IfExpression(Expression):
    token: Token
    condition: Expression
    consequence: BlockStatement
    alternative: Optional[BlockStatement]

    def token_literal(self) -> str:
        return self.token.literal

    def __str__(self) -> str:
        result = f"if {self.condition} {self.consequence}"
        if self.alternative:
            result += f" else {self.alternative}"
        return result

@dataclass
class FunctionLiteral(Expression):
    token: Token
    parameters: List[Identifier]
    body: BlockStatement

    def token_literal(self) -> str:
        return self.token.literal

    def __str__(self) -> str:
        params = ", ".join(str(param) for param in self.parameters)
        return f"{self.token_literal()}({params}) {self.body}"

@dataclass
class CallExpression(Expression):
    token: Token
    function: Expression
    arguments: List[Expression]

    def token_literal(self) -> str:
        return self.token.literal

    def __str__(self) -> str:
        args = ", ".join(str(arg) for arg in self.arguments)
        return f"{self.function}({args})"

@dataclass
class StringLiteral(Expression):
    token: Token
    value: str

    def token_literal(self) -> str:
        return self.token.literal

    def __str__(self) -> str:
        return self.token.literal

@dataclass
class ArrayLiteral(Expression):
    token: Token
    elements: List[Expression]

    def token_literal(self) -> str:
        return self.token.literal

    def __str__(self) -> str:
        elements = ", ".join(str(elem) for elem in self.elements)
        return f"[{elements}]"

@dataclass
class IndexExpression(Expression):
    token: Token
    left: Expression
    index: Expression

    def token_literal(self) -> str:
        return self.token.literal

    def __str__(self) -> str:
        return f"({self.left}[{self.index}])"

@dataclass
class HashLiteral(Expression):
    token: Token
    pairs: Dict[Expression, Expression]

    def token_literal(self) -> str:
        return self.token.literal

    def __str__(self) -> str:
        pairs = ", ".join(f"{key}: {value}" for key, value in self.pairs.items())
        return f""

To be continued...

🎰