Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(markdown): improve markdown parser and diagnostics #5292

Open
wants to merge 32 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
b712073
feat(markdown_parser): add blockquote block parsing support
afonsojramos Mar 7, 2025
6cfda97
feat(markdown_parser): add code block parsing support for indented an…
afonsojramos Mar 7, 2025
3e88015
feat(markdown_parser): add header block parsing support
afonsojramos Mar 7, 2025
421f9e8
feat(markdown_parser): add HTML block parsing support
afonsojramos Mar 7, 2025
339ba33
feat(markdown_parser): add list block parsing support
afonsojramos Mar 7, 2025
184aeec
feat(markdown_parser): add paragraph block parsing support
afonsojramos Mar 7, 2025
0ef00ca
feat(markdown_parser): add table block parsing support
afonsojramos Mar 7, 2025
d5cd19b
feat(markdown_parser): improve thematic break block parsing
afonsojramos Mar 7, 2025
f0758f3
feat(markdown_parser): add document parsing method
afonsojramos Mar 7, 2025
1d511ad
feat(markdown_parser): implement comprehensive document parsing with …
afonsojramos Mar 7, 2025
5c03604
feat(markdown_parser): add header validation and improve parsing robu…
afonsojramos Mar 7, 2025
7b4fbc6
test(markdown_parser): add test case for invalid markdown headers
afonsojramos Mar 7, 2025
7f822f2
test(markdown_parser): add test cases for blockquotes, comprehensive …
afonsojramos Mar 7, 2025
ba4f600
test(markdown_parser): refactor spec tests and add more flexible test…
afonsojramos Mar 7, 2025
381d45b
feat(markdown_parser): add support for additional Markdown syntax tok…
afonsojramos Mar 7, 2025
ec95111
feat(configuration): add Markdown configuration support
afonsojramos Mar 7, 2025
4a0fdf6
feat(cli): add Markdown linter configuration support
afonsojramos Mar 7, 2025
9b3bc39
Merge remote-tracking branch 'biome/main' into feat/markdown-parser
afonsojramos Mar 7, 2025
63d17b8
refactor(markdown_parser): improve parsing robustness for various Mar…
afonsojramos Mar 7, 2025
8991695
chore: run cargo fmt
afonsojramos Mar 7, 2025
833a088
chore: remove Markdown configuration
afonsojramos Mar 10, 2025
0222689
refactor(markdown_parser): remove test files and cleanup markdown con…
afonsojramos Mar 11, 2025
cf45339
feat(markdown_parser): improve thematic break parsing with more robus…
afonsojramos Mar 12, 2025
9a29b58
refactor(markdown_parser): improve blockquote parsing with enhanced l…
afonsojramos Mar 12, 2025
135c2b5
refactor(markdown_parser): improve header block parsing with simplifi…
afonsojramos Mar 12, 2025
18b9f0c
refactor(markdown_parser): improve lexer parsing for Markdown list ma…
afonsojramos Mar 12, 2025
804f2ba
refactor(markdown_parser): enhance parsing logic for Markdown documents
afonsojramos Mar 15, 2025
b45b44a
refactor(markdown_parser): enhance header block parsing with improved…
afonsojramos Mar 15, 2025
c9743c9
test(markdown_parser): add comprehensive tests for list marker parsing
afonsojramos Mar 15, 2025
e91ec37
refactor(markdown_parser): enhance list parsing logic with improved v…
afonsojramos Mar 15, 2025
952ca45
refactor(markdown_parser): streamline paragraph block parsing logic
afonsojramos Mar 15, 2025
0505548
refactor(markdown_parser): update snapshot files for blockquotes, he…
afonsojramos Mar 15, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions crates/biome_cli/src/commands/lint.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,12 +83,14 @@ impl CommandRunner for LintCommandPayload {
.get_or_insert_with(Default::default);
graphql.linter.merge_with(self.graphql_linter.clone());
}

if self.javascript_linter.is_some() {
let javascript = fs_configuration
.javascript
.get_or_insert_with(Default::default);
javascript.linter.merge_with(self.javascript_linter.clone());
}

if self.json_linter.is_some() {
let json = fs_configuration.json.get_or_insert_with(Default::default);
json.linter.merge_with(self.json_linter.clone());
Expand Down
2 changes: 1 addition & 1 deletion crates/biome_markdown_factory/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,6 @@ pub use crate::generated::MarkdownSyntaxFactory;
#[doc(hidden)]
pub use biome_markdown_syntax as syntax;

pub type DemoSyntaxTreeBuilder = TreeBuilder<'static, MarkdownLanguage, MarkdownSyntaxFactory>;
pub type MarkdownSyntaxTreeBuilder = TreeBuilder<'static, MarkdownLanguage, MarkdownSyntaxFactory>;

pub mod make;
122 changes: 122 additions & 0 deletions crates/biome_markdown_factory/src/make.rs
Original file line number Diff line number Diff line change
@@ -1 +1,123 @@
use biome_markdown_syntax::{MarkdownSyntaxKind, MarkdownSyntaxToken};

pub use crate::generated::node_factory::*;

/// Build a detached token of `kind` carrying `text`, with no leading or
/// trailing trivia attached.
fn detached(kind: MarkdownSyntaxKind, text: &str) -> MarkdownSyntaxToken {
    MarkdownSyntaxToken::new_detached(kind, text, [], [])
}

/// Declare a zero-argument factory function producing a detached token
/// with a fixed kind and fixed text.
macro_rules! fixed_token {
    ($doc:literal, $name:ident, $kind:ident, $text:literal) => {
        #[doc = $doc]
        pub fn $name() -> MarkdownSyntaxToken {
            detached(MarkdownSyntaxKind::$kind, $text)
        }
    };
}

/// Create a textual token
pub fn textual(text: &str) -> MarkdownSyntaxToken {
    detached(MarkdownSyntaxKind::MD_TEXTUAL_LITERAL, text)
}

/// Create a string token
pub fn string(text: &str) -> MarkdownSyntaxToken {
    detached(MarkdownSyntaxKind::MD_STRING_LITERAL, text)
}

/// Create a whitespace token
pub fn whitespace(text: &str) -> MarkdownSyntaxToken {
    detached(MarkdownSyntaxKind::WHITESPACE, text)
}

/// Create a digit token for ordered lists
pub fn digit(text: &str) -> MarkdownSyntaxToken {
    detached(MarkdownSyntaxKind::DIGIT, text)
}

fixed_token!("Create a hash token for headers", hash, HASH, "#");
fixed_token!("Create a backtick token", backtick, BACKTICK, "`");
fixed_token!("Create a star token for emphasis", star, STAR, "*");
fixed_token!("Create an underscore token for emphasis", underscore, UNDERSCORE, "_");
fixed_token!("Create a left bracket token", l_brack, L_BRACK, "[");
fixed_token!("Create a right bracket token", r_brack, R_BRACK, "]");
fixed_token!("Create a left parenthesis token", l_paren, L_PAREN, "(");
fixed_token!("Create a right parenthesis token", r_paren, R_PAREN, ")");
fixed_token!("Create a bang token for images", bang, BANG, "!");
fixed_token!("Create a minus token for thematic breaks", minus, MINUS, "-");
fixed_token!(
    "Create a thematic break token",
    thematic_break,
    MD_THEMATIC_BREAK_LITERAL,
    "---"
);
fixed_token!("Create a newline token", newline, NEWLINE, "\n");
fixed_token!("Create a tab token", tab, TAB, "\t");
fixed_token!(
    "Create an indent chunk token for indented code blocks",
    indent_chunk,
    MD_INDENT_CHUNK_LITERAL,
    " "
);
fixed_token!(
    "Create a hard line break token",
    hard_line_break,
    MD_HARD_LINE_LITERAL,
    " \n"
);
fixed_token!("Create a greater than token for blockquotes", greater_than, R_ANGLE, ">");
fixed_token!("Create a plus token for unordered lists", plus, PLUS, "+");
fixed_token!("Create a period token for ordered lists", period, PERIOD, ".");
fixed_token!("Create a pipe token for tables", pipe, PIPE, "|");
fixed_token!("Create a colon token for table alignment", colon, COLON, ":");
154 changes: 136 additions & 18 deletions crates/biome_markdown_parser/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,11 @@ impl<'src> MarkdownLexer<'src> {
let dispatched = lookup_byte(current);
match dispatched {
WHS => self.consume_newline_or_whitespace(),
MUL | MIN | IDT => self.consume_thematic_break_literal(),
MUL => self.consume_star(),
MIN => self.consume_minus(),
IDT => self.consume_underscore(),
DIG => self.consume_digit(),
PLS => self.consume_plus(),
_ => self.consume_textual(),
}
}
Expand Down Expand Up @@ -269,31 +273,128 @@ impl<'src> MarkdownLexer<'src> {
TAB
}

fn consume_thematic_break_literal(&mut self) -> MarkdownSyntaxKind {
fn consume_star(&mut self) -> MarkdownSyntaxKind {
self.assert_at_char_boundary();

let start_char = match self.current_byte() {
Some(b'-') => b'-',
Some(b'*') => b'*',
Some(b'_') => b'_',
_ => return self.consume_textual(),
};
// First check if this might be a thematic break
let checkpoint = self.position;

let mut count = 0;
loop {
self.consume_whitespace();
if matches!(self.current_byte(), Some(ch) if ch == start_char) {
self.advance(1);
count += 1;
// Try to recognize thematic breaks like "***" or "* * *"
if self.is_thematic_break(b'*') {
return MD_THEMATIC_BREAK_LITERAL;
}

// Reset position after thematic break check
self.position = checkpoint;

// Check for list marker (* )
self.advance(1); // Consume the star

if matches!(self.current_byte(), Some(b' ' | b'\t')) {
// It's a list marker
return STAR;
}

// Not a special token, just a regular star
self.position = checkpoint;
self.consume_textual()
}

/// Lex a `-`: a thematic break (`---`, `- - -`), an unordered-list
/// marker (`- ` / `-\t`), or plain text.
fn consume_minus(&mut self) -> MarkdownSyntaxKind {
    self.assert_at_char_boundary();

    // Remember where the `-` run begins so we can back out.
    let start = self.position;

    // Runs such as "---" or "- - -" form a thematic break; the helper
    // consumes the whole break when it matches.
    if self.is_thematic_break(b'-') {
        return MD_THEMATIC_BREAK_LITERAL;
    }
    self.position = start;

    // A single `-` followed by a space or tab marks an unordered list item.
    self.advance(1);
    let is_list_marker = matches!(self.current_byte(), Some(b' ') | Some(b'\t'));
    if is_list_marker {
        return MINUS;
    }

    // Otherwise the `-` is ordinary text.
    self.position = start;
    self.consume_textual()
}

/// Lex a `_`: a thematic break (`___`, `_ _ _`) or plain text.
fn consume_underscore(&mut self) -> MarkdownSyntaxKind {
    self.assert_at_char_boundary();

    // Snapshot the position; runs like "___" or "_ _ _" up to the end of
    // the line are thematic breaks.
    let rewind_to = self.position;
    if self.is_thematic_break(b'_') {
        MD_THEMATIC_BREAK_LITERAL
    } else {
        // Anything else falls through to ordinary text.
        self.position = rewind_to;
        self.consume_textual()
    }
}

/// Check if the current position starts a thematic break
/// This handles patterns like "---", "***", "___" as well as "- - -", "* * *", "_ _ _"
fn is_thematic_break(&mut self, marker: u8) -> bool {
let mut marker_count = 0;
let mut pos = self.position;
let src = self.source.as_bytes();

while pos < src.len() {
if pos < src.len() && src[pos] == marker {
marker_count += 1;
pos += 1;
} else if pos < src.len() && (src[pos] == b' ' || src[pos] == b'\t') {
pos += 1;
} else {
break;
}
}
// until next newline or eof
if matches!(self.current_byte(), Some(b'\n' | b'\r') | None) && count >= 3 {
return MD_THEMATIC_BREAK_LITERAL;

// A valid thematic break must have at least 3 markers and be followed by a newline or EOF
if marker_count >= 3 && (pos >= src.len() || src[pos] == b'\n' || src[pos] == b'\r') {
// Consume the entire thematic break
self.position = pos;
return true;
}
ERROR_TOKEN

false
}

/// Lex a run of digits: an ordered-list marker (`1. ` / `1) `) or
/// plain text.
fn consume_digit(&mut self) -> MarkdownSyntaxKind {
    self.assert_at_char_boundary();

    // Remember the start so we can rewind if this is not a list marker.
    let start = self.position;

    // Greedily take the whole run of ASCII digits.
    while matches!(self.current_byte(), Some(b'0'..=b'9')) {
        self.advance(1);
    }

    // An ordered-list marker is digits followed by `.` or `)` and then
    // a space or tab.
    if matches!(self.current_byte(), Some(b'.' | b')')) {
        self.advance(1);
        if matches!(self.current_byte(), Some(b' ' | b'\t')) {
            return DIGIT;
        }
    }

    // Not a list marker: rewind and lex as ordinary text.
    self.position = start;
    self.consume_textual()
}

/// Get the UTF8 char which starts at the current byte
Expand Down Expand Up @@ -368,6 +469,23 @@ impl<'src> MarkdownLexer<'src> {
self.advance(1);
tok
}

/// Lex a `+`: an unordered-list marker (`+ ` / `+\t`) or plain text.
fn consume_plus(&mut self) -> MarkdownSyntaxKind {
    self.assert_at_char_boundary();

    let start = self.position;

    // `+` followed by a space or tab introduces an unordered list item.
    self.advance(1);
    if let Some(b' ' | b'\t') = self.current_byte() {
        return PLUS;
    }

    // Otherwise rewind and emit the `+` as ordinary text.
    self.position = start;
    self.consume_textual()
}
}

impl<'src> ReLexer<'src> for MarkdownLexer<'src> {
Expand Down
54 changes: 54 additions & 0 deletions crates/biome_markdown_parser/src/lexer/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -162,3 +162,57 @@ _ _ _ _ _ "#,
MD_THEMATIC_BREAK_LITERAL:11,
}
}

// This is a test for list markers
#[test]
fn list_markers() {
    // Lex the whole input and collect every token kind up to (excluding)
    // EOF. Shared here so the same lexing loop is not repeated per case.
    let lex = |source: &str| {
        let mut lexer = MarkdownLexer::from_str(source);
        let mut tokens = Vec::new();
        while lexer.next_token(MarkdownLexContext::default()) != EOF {
            tokens.push(lexer.current());
        }
        tokens
    };

    // Star list marker.
    let tokens = lex("* List item");
    assert_eq!(tokens[0], STAR);
    assert_eq!(tokens[1], WHITESPACE);

    // Minus list marker.
    let tokens = lex("- List item");
    assert_eq!(tokens[0], MINUS);
    assert_eq!(tokens[1], WHITESPACE);

    // Plus list marker.
    let tokens = lex("+ List item");
    assert_eq!(tokens[0], PLUS);
    assert_eq!(tokens[1], WHITESPACE);

    // The token sequence for "1. List item" should be:
    // 1. DIGIT for "1"
    // 2. MD_TEXTUAL_LITERAL for "."
    // 3. WHITESPACE for " "
    // 4+. MD_TEXTUAL_LITERAL for each letter in "List item"
    // NOTE(review): `consume_digit` advances past the `.` before returning
    // DIGIT, so whether the `.` really surfaces as a separate
    // MD_TEXTUAL_LITERAL is worth confirming against the lexer.
    let tokens = lex("1. List item");
    assert_eq!(tokens[0], DIGIT);
    assert_eq!(tokens[1], MD_TEXTUAL_LITERAL);
    assert_eq!(tokens[2], WHITESPACE);
}
Loading