aboutsummaryrefslogtreecommitdiff
path: root/src/lexer/lexer.zig
blob: 8488a1645bbc8ce9a5cbea81328f643d7372d298 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
const std = @import("std");
const Allocator = std.mem.Allocator;
const unicode = std.unicode;
const lexed = @import("lexed.zig");

/// Single-codepoint strings that `Lexer.getCurrentKind` classifies as operators.
const operators = [_][]const u8{
    "*", "_", "`", "<", ">", ":", "!",
    "[", "]", "(", ")", "$", "-", ".",
};

/// Single-codepoint strings that `Lexer.getCurrentKind` classifies as delimiters.
const delimiters = [_][]const u8{
    "\n",
};

/// Splits UTF-8 text into tokens, where each token is a maximal run of
/// consecutive codepoints that share the same `lexed.Kind`
/// (`.literal`, `.operator` or `.delimiter`).
///
/// Carriage returns are transparent: they never appear in token text and they
/// never start or end a run, so CRLF input lexes exactly like LF input.
/// A backslash escapes the following codepoint, forcing it to be a literal;
/// the backslash itself is not part of the token text.
pub const Lexer = struct {
    /// Cursor over the not-yet-consumed part of the input.
    iter: unicode.Utf8Iterator,
    /// Set after a backslash has been consumed; while set, the next codepoint
    /// is classified as `.literal` regardless of what it is.
    force_lit: bool = false,

    const Self = @This();

    pub const Error = error{
        InvalidUtf8,
    } || Allocator.Error;

    /// Creates a lexer over `content`.
    ///
    /// Returns `error.InvalidUtf8` when `content` is not valid UTF-8.
    /// `content` is not copied, so it must stay alive while the lexer is used.
    pub fn init(content: []const u8) Error!Lexer {
        const view = try unicode.Utf8View.init(content);
        return .{ .iter = view.iterator() };
    }

    /// Consumes and returns the next token, or `null` once the input is
    /// exhausted.
    ///
    /// The token text is accumulated in a buffer allocated from `alloc` and
    /// handed to `lexed.Lexed.init`. A backslash that is the very last
    /// codepoint of the input produces an empty `.literal` token.
    pub fn next(self: *Self, alloc: Allocator) Error!?lexed.Lexed {
        var acc = try std.ArrayList(u8).initCapacity(alloc, 2);
        errdefer acc.deinit(alloc);

        var current_kind: ?lexed.Kind = null;
        while (self.iter.nextCodepointSlice()) |rune| {
            if (std.mem.eql(u8, rune, "\r")) {
                // Dropped from the output. Deliberately NOT a `continue`:
                // the boundary check below must still run, otherwise a run
                // such as "a\r\n" would absorb the "\n" into the literal.
            } else if (std.mem.eql(u8, rune, "\\")) {
                // Escape: the backslash is discarded and the codepoint that
                // follows it is forced to be a literal.
                self.force_lit = true;
                current_kind = .literal;
            } else {
                current_kind = self.getCurrentKind(rune);
                self.force_lit = false;
                try acc.appendSlice(alloc, rune);
            }

            // Only carriage returns seen so far: the token has not started.
            const run_kind = current_kind orelse continue;

            // Stop as soon as the next meaningful codepoint belongs to a
            // different kind; it is left in the iterator for the next call.
            const upcoming = self.peekSignificant() orelse break;
            if (self.getCurrentKind(upcoming) != run_kind) break;
        }

        const kind = current_kind orelse {
            // Nothing but (possibly) carriage returns was left.
            acc.deinit(alloc);
            return null;
        };
        // NOTE(review): if `Lexed.init` can fail after it has taken ownership
        // of `acc`, the errdefer above would release the buffer a second
        // time — confirm against lexed.zig.
        return lexed.Lexed.init(alloc, kind, acc);
    }

    /// Returns the next codepoint that is not a carriage return without
    /// advancing `self.iter`, or `null` when no such codepoint remains.
    fn peekSignificant(self: *const Self) ?[]const u8 {
        // Walk a copy of the iterator so the real cursor stays where it is.
        var probe = self.iter;
        while (probe.nextCodepointSlice()) |rune| {
            if (!std.mem.eql(u8, rune, "\r")) return rune;
        }
        return null;
    }

    /// Classifies a single codepoint. While an escape is pending
    /// (`force_lit`), everything is a literal; otherwise the codepoint is
    /// looked up in the `operators` and `delimiters` tables and falls back to
    /// `.literal`.
    fn getCurrentKind(self: *const Self, rune: []const u8) lexed.Kind {
        if (self.force_lit) return .literal;
        if (isIn(&operators, rune)) return .operator;
        if (isIn(&delimiters, rune)) return .delimiter;
        return .literal;
    }
};

test "carriage returns are dropped without hiding token boundaries" {
    const expect = std.testing.expect;
    const alloc = std.testing.allocator;

    var lexer = try Lexer.init("a\r\n*");

    var word = (try lexer.next(alloc)).?;
    defer word.deinit();
    try expect(word.equals(.literal, "a"));

    var newline = (try lexer.next(alloc)).?;
    defer newline.deinit();
    try expect(newline.equals(.delimiter, "\n"));

    var star = (try lexer.next(alloc)).?;
    defer star.deinit();
    try expect(star.equals(.operator, "*"));

    try expect(try lexer.next(alloc) == null);
}

/// Reports whether `v` is byte-for-byte equal to at least one entry of `arr`.
fn isIn(arr: []const []const u8, v: []const u8) bool {
    var idx: usize = 0;
    while (idx < arr.len) : (idx += 1) {
        const candidate = arr[idx];
        if (std.mem.eql(u8, candidate, v)) return true;
    }
    return false;
}

test "literal" {
    const expect = std.testing.expect;

    // The testing allocator reports leaked allocations as a test failure,
    // so no hand-rolled allocator setup/teardown is needed.
    const alloc = std.testing.allocator;

    var lexer = try Lexer.init("hello world :)");

    // Letters and spaces are all literals, so they form a single run that
    // ends right before the first operator.
    var first = (try lexer.next(alloc)).?;
    defer first.deinit();
    try expect(first.equals(.literal, "hello world "));

    // ":" and ")" are both operators and are merged into one token.
    var second = (try lexer.next(alloc)).?;
    defer second.deinit();
    try expect(second.equals(.operator, ":)"));

    // Input exhausted.
    try expect(try lexer.next(alloc) == null);
}