From c4f41ad2502567f641652eb745707d2c2817973b Mon Sep 17 00:00:00 2001 From: Anhgelus Morhtuuzh Date: Sat, 18 Apr 2026 15:35:46 +0200 Subject: feat(lexer): enforce delimiter requirements --- src/dom/Element.zig | 10 +++--- src/dom/html.zig | 2 +- src/lexer/Lexed.zig | 11 ++++++- src/lexer/Lexer.zig | 91 +++++++++++++++++++++++++++++++++++++++-------------- 4 files changed, 83 insertions(+), 31 deletions(-) (limited to 'src') diff --git a/src/dom/Element.zig b/src/dom/Element.zig index 0cbddc4..524586c 100644 --- a/src/dom/Element.zig +++ b/src/dom/Element.zig @@ -21,7 +21,7 @@ literal: ?[]const u8 = null, /// Init a new Element with the given kind. /// The tag will never be escaped. -/// The owernship is always taken. +/// It always duplicates strings. pub fn init(alloc: Allocator, knd: Kind, tag: []const u8) !Self { var v = Self{ .kind = knd, @@ -36,7 +36,7 @@ pub fn init(alloc: Allocator, knd: Kind, tag: []const u8) !Self { /// Init a new literal element. /// The literal content will never be escaped, see initLitEscaped if you want to escape it. -/// The owernship is always taken. +/// It always duplicates strings. pub fn initLit(alloc: Allocator, literal: []const u8) !Self { var v = Self{ .kind = .literal, @@ -51,7 +51,7 @@ pub fn initLit(alloc: Allocator, literal: []const u8) !Self { /// Init a new literal element that is escaped. /// The literal content will be escaped, see initLit if you don't want this behavior. -/// The owernship is always taken. +/// It always duplicates strings. pub fn initLitEscaped(alloc: Allocator, literal: []const u8) !Self { const escaped = try html.escape(alloc, literal); defer alloc.free(escaped); @@ -198,7 +198,7 @@ fn doTest(alloc: Allocator, el: *Self, exp: []const u8) !void { test "void element" { var arena = std.heap.DebugAllocator(.{}).init; - defer _ = arena.deinit(); + defer if (arena.deinit() == .leak) std.debug.print("leaking!\n", .{}); const alloc = arena.allocator(); var br = try init(alloc, .void, "br"); @@ -220,7 +220,7 @@ test "void element" { test "content element" { var arena = std.heap.DebugAllocator(.{}).init; - defer _ = arena.deinit(); + defer if (arena.deinit() == .leak) std.debug.print("leaking!\n", .{}); const alloc = arena.allocator(); var p = try init(alloc, .content, "p"); diff --git a/src/dom/html.zig b/src/dom/html.zig index 47de020..a3178f2 100644 --- a/src/dom/html.zig +++ b/src/dom/html.zig @@ -35,7 +35,7 @@ fn doTest(alloc: std.mem.Allocator, el: []const u8, exp: []const u8) !void { test "escaping html" { var arena = std.heap.DebugAllocator(.{}).init; - defer _ = arena.deinit(); + defer if (arena.deinit() == .leak) std.debug.print("leaking!\n", .{}); const alloc = arena.allocator(); try doTest(alloc, "hello world", "hello world"); diff --git a/src/lexer/Lexed.zig b/src/lexer/Lexed.zig index b7c3b2c..4101953 100644 --- a/src/lexer/Lexed.zig +++ b/src/lexer/Lexed.zig @@ -3,7 +3,8 @@ const Allocator = std.mem.Allocator; pub const Kind = enum { literal, - delimiter, + weak_delimiter, + strong_delimiter, title, quote, code, @@ -19,6 +20,14 @@ pub const Kind = enum { list_ordored, list_unordored, tag, + + pub fn isDelimiter(self: @This()) bool { + return switch (self) { + .weak_delimiter => true, + .strong_delimiter => true, + else => false, + }; + } }; allocator: Allocator, diff --git a/src/lexer/Lexer.zig b/src/lexer/Lexer.zig index 2705347..f492be6 100644 --- a/src/lexer/Lexer.zig +++ b/src/lexer/Lexer.zig @@ -39,22 +39,21 @@ pub fn next(self: *Self, alloc: Allocator) Error!?Lexed { } // conds here to avoid creating complex condition in while const next_rune = self.iter.peek(1); - if (next_rune.len > 0) { - if (self.getCurrentKind(current_kind, next_rune, acc.items).kind != current_kind.? and - (override_if == null or !eql(u8, override_if.?, next_rune))) - { - if (!requiresSpace(current_kind.?)) break; - if (eql(u8, next_rune, " ")) { - // consume next space - _ = self.iter.nextCodepoint(); - break; - } - current_kind = switch (current_kind.?) { - .title => if (acc.items.len == 1) .tag else .literal, - else => .literal, - }; + if (requiresSpace(current_kind.?)) { + if (eql(u8, next_rune, " ")) { + // consume next space + _ = self.iter.nextCodepoint(); + break; } + current_kind = switch (current_kind.?) { + .title => if (acc.items.len == 1) .tag else .literal, + else => .literal, + }; } + if (next_rune.len > 0 and + self.getCurrentKind(current_kind, next_rune, acc.items).kind != current_kind.? and + (override_if == null or !eql(u8, override_if.?, next_rune))) + break; } const kind = current_kind orelse { acc.deinit(alloc); @@ -76,17 +75,24 @@ const kindRes = struct { } }; +fn requiresDelimiter(before: ?Lexed.Kind, knd: Lexed.Kind) Lexed.Kind { + return if (before == null or before.?.isDelimiter()) knd else .literal; +} + fn getCurrentKind(self: *Self, before: ?Lexed.Kind, rune: []const u8, acc: []const u8) kindRes { if (self.force_lit) return .{ .kind = .literal }; - if (eql(u8, rune, "\n")) return .{ .kind = .delimiter }; + if (eql(u8, rune, "\n")) return .{ + .kind = if (before == .weak_delimiter) .strong_delimiter else .weak_delimiter, + .override_if = rune, + }; if (eql(u8, rune, "*")) return .{ .kind = .bold }; if (eql(u8, rune, "_")) return .{ .kind = .italic }; - if (eql(u8, rune, ">")) return .{ .kind = .quote }; - if (eql(u8, rune, "-")) return .{ .kind = .list_unordored }; - if (eql(u8, rune, ".")) return .{ .kind = .list_ordored }; - if (eql(u8, rune, "!")) return .{ .kind = .image }; + if (eql(u8, rune, ">")) return .{ .kind = requiresDelimiter(before, .quote) }; + if (eql(u8, rune, ".")) return .{ .kind = requiresDelimiter(before, .list_ordored) }; + if (eql(u8, rune, "-")) return .{ .kind = requiresDelimiter(before, .list_unordored) }; + if (eql(u8, rune, "!")) return .{ .kind = requiresDelimiter(before, .image) }; if (eql(u8, rune, "<")) return .{ .kind = .ref }; - if (is('#', 6, rune, acc)) return .{ .kind = .title }; + if (is('#', 6, rune, acc)) return .{ .kind = requiresDelimiter(before, .title) }; if (isIn(links, rune, acc, before, .link)) return .{ .kind = .link }; if (isOneOrThree(":", rune, acc, .ref, .callout)) |it| return it; if (isOneOrThree("$", rune, acc, .math, .math_block)) |it| return it; @@ -95,9 +101,9 @@ fn getCurrentKind(self: *Self, before: ?Lexed.Kind, rune: []const u8, acc: []con } fn is(v: u8, maxLen: usize, rune: []const u8, acc: []const u8) bool { - if (acc.len >= maxLen) return false; - for (0..acc.len) |i| if (acc[i] != v) return false; - return eql(u8, rune, &[_]u8{v}); + if (!eql(u8, rune, &[_]u8{v})) return false; + for (acc) |it| if (it != v) return true; + return acc.len < maxLen; } const links = &[_][]const u8{ "[", "](", ")" }; @@ -177,7 +183,7 @@ test "lexer common" { const expect = std.testing.expect; var arena = std.heap.DebugAllocator(.{}).init; - defer _ = arena.deinit(); + defer if (arena.deinit() == .leak) std.debug.print("leaking!\n", .{}); const alloc = arena.allocator(); var l = try init("# hello world :)"); @@ -189,3 +195,40 @@ test "lexer common" { try expect(try l.next(alloc) == null); } + +test "lexer multiline" { + const expect = std.testing.expect; + + var arena = std.heap.DebugAllocator(.{}).init; + defer if (arena.deinit() == .leak) std.debug.print("leaking!\n", .{}); + const alloc = arena.allocator(); + + var l = try init( + \\# Title + \\ + \\paragraph + \\# a title + \\a # in sentence + \\ + \\#tag + \\#tag2 + ); + + try doTest(alloc, &l, .title, "#"); + try doTest(alloc, &l, .literal, "Title"); + try doTest(alloc, &l, .strong_delimiter, "\n\n"); + try doTest(alloc, &l, .literal, "paragraph"); + try doTest(alloc, &l, .weak_delimiter, "\n"); + try doTest(alloc, &l, .title, "#"); + try doTest(alloc, &l, .literal, "a title"); + try doTest(alloc, &l, .weak_delimiter, "\n"); + try doTest(alloc, &l, .literal, "a # in sentence"); + try doTest(alloc, &l, .strong_delimiter, "\n\n"); + try doTest(alloc, &l, .tag, "#"); + try doTest(alloc, &l, .literal, "tag"); + try doTest(alloc, &l, .weak_delimiter, "\n"); + try doTest(alloc, &l, .tag, "#"); + try doTest(alloc, &l, .literal, "tag2"); + + try expect(try l.next(alloc) == null); +} -- cgit v1.2.3