ribbit/src/ts/serializer.ts

199 lines
6.7 KiB
TypeScript
Raw Normal View History

/*
* serializer.ts — DOM to markdown serializer.
*
* Converts an HTML DOM tree back to markdown by walking the tree and
* producing a typed token stream. Text tokens are escaped during final
* serialization; delimiter tokens pass through verbatim. This separation
* is what makes round-trip correctness possible: the serializer always
* knows which characters are structural and which are literal.
*
* const serializer = new MarkdownSerializer(tagMap, delimiterChars);
* serializer.serialize(document.getElementById('content'))
* // '**bold** and *italic*'
*/
import type { InlineToken } from './tokenizer';
/**
 * Describes how one HTML element is turned back into markdown.
 *
 * Both fields are optional. The serializer consults `serialize` first;
 * if it is absent, a non-empty `delimiter` is used to wrap the element's
 * children; if neither is usable the element is treated as transparent
 * and only its children are serialized.
 */
export interface SerializerTagDef {
    /**
     * Canonical delimiter emitted on both sides of the element's
     * children (e.g. `'**'` for bold).
     */
    delimiter?: string;

    /**
     * Custom serializer for elements that are not simple delimiter
     * wraps (e.g. links, code blocks, headings).
     *
     * @param element - The element being serialized.
     * @param children - Lazily serializes the element's children and
     *   returns them as a markdown string.
     * @returns The complete markdown for the element and its children.
     */
    serialize?: (element: HTMLElement, children: () => string) => string;
}
/**
 * Converts a DOM tree to markdown. Walks the tree producing inline
 * tokens, then serializes the token stream to a string, escaping only
 * the tokens that carry literal text.
 *
 *     const serializer = new MarkdownSerializer(tagMap, new Set(['*', '`', '~', '[', '_']));
 *     const markdown = serializer.serialize(containerElement);
 */
export class MarkdownSerializer {
    /** `Node.nodeType` value of a text node. */
    private static readonly TEXT_NODE = 3;

    /** `Node.nodeType` value of an element node. */
    private static readonly ELEMENT_NODE = 1;

    /**
     * Matches a character that, directly after `<`, makes the pair look
     * like the start of an HTML tag (`<b`, `</b`). Kept as a constant so
     * the regex is not rebuilt for every character of every text node.
     * It has no `g` flag, so `test()` is stateless.
     */
    private static readonly TAG_START = /[a-zA-Z/]/;

    /** Serialization rules, looked up by `element.nodeName`. */
    private readonly tagMap: Map<string, SerializerTagDef>;

    /** Characters that are backslash-escaped when found in literal text. */
    private readonly delimiterChars: Set<string>;

    /**
     * @param tagMap - Serialization rules keyed by the exact value of
     *   `element.nodeName`.
     * @param delimiterChars - Single characters to escape in literal
     *   text, in addition to the ones `escapeText` always handles.
     */
    constructor(
        tagMap: Map<string, SerializerTagDef>,
        delimiterChars: Set<string>,
    ) {
        this.tagMap = tagMap;
        this.delimiterChars = delimiterChars;
    }

    /**
     * Serialize a DOM tree to a markdown string.
     *
     *     serializer.serialize(document.querySelector('article'))
     *
     * @param node - Root of the tree to serialize.
     * @returns The markdown for `node` and its descendants.
     */
    serialize(node: Node): string {
        return this.tokensToString(this.nodeToTokens(node));
    }

    /**
     * Convert a DOM node to a stream of inline tokens.
     *
     * - Text nodes become a single `'text'` token.
     * - Nodes that are neither text nor elements produce no tokens.
     * - Elements with a custom `serialize` become a single `'html'` token.
     * - Elements with a non-empty `delimiter` become open + children + close.
     * - All other elements contribute only their children's tokens.
     */
    private nodeToTokens(node: Node): InlineToken[] {
        if (node.nodeType === MarkdownSerializer.TEXT_NODE) {
            return [{ role: 'text', value: node.textContent ?? '' }];
        }
        if (node.nodeType !== MarkdownSerializer.ELEMENT_NODE) {
            return [];
        }

        const element = node as HTMLElement;
        const tagDef = this.tagMap.get(element.nodeName);

        if (tagDef?.serialize) {
            // The custom serializer returns finished markdown, so it is
            // carried in an 'html' token, which tokensToString emits
            // verbatim (escaping it again would corrupt it).
            const markdown = tagDef.serialize(element, () => this.serializeChildren(element));
            return [{ role: 'html', value: markdown }];
        }

        const children = this.childrenToTokens(element);

        // Truthiness check on purpose: an empty-string delimiter is
        // treated the same as no delimiter at all.
        if (tagDef?.delimiter) {
            const delimiter = tagDef.delimiter;
            return [
                { role: 'open', value: delimiter, delimiter },
                ...children,
                { role: 'close', value: delimiter, delimiter },
            ];
        }

        // Unknown element: transparent, only its children matter.
        return children;
    }

    /**
     * Collect the tokens of every child node of `element`, in document order.
     */
    private childrenToTokens(element: HTMLElement): InlineToken[] {
        const tokens: InlineToken[] = [];
        for (const child of Array.from(element.childNodes)) {
            tokens.push(...this.nodeToTokens(child));
        }
        return tokens;
    }

    /**
     * Serialize an element's children directly to a markdown string.
     * Handed to custom serializers (links, headings, etc.) that need
     * the children as a string rather than as tokens.
     */
    private serializeChildren(element: HTMLElement): string {
        return this.tokensToString(this.childrenToTokens(element));
    }

    /**
     * Convert a token stream to a markdown string. This is the only
     * place escaping happens: `'text'` tokens go through `escapeText`;
     * tokens of every other role are appended verbatim.
     */
    private tokensToString(tokens: InlineToken[]): string {
        let markdown = '';
        for (const token of tokens) {
            markdown += token.role === 'text'
                ? this.escapeText(token.value)
                : token.value;
        }
        return markdown;
    }

    /**
     * Backslash-escape characters in literal text that would otherwise
     * be read as markdown syntax on re-parse.
     *
     * Escaped characters: `\`, `_`, `[`, every character registered in
     * `delimiterChars`, and `<` when the next character is an ASCII
     * letter or `/` (i.e. when it would start an HTML tag).
     */
    private escapeText(text: string): string {
        let escaped = '';
        for (let index = 0; index < text.length; index++) {
            const character = text.charAt(index);
            // charAt past the end yields '', which never matches TAG_START,
            // so a trailing '<' is not treated as a tag start.
            if (this.needsEscape(character, text.charAt(index + 1))) {
                escaped += '\\';
            }
            escaped += character;
        }
        return escaped;
    }

    /**
     * Decide whether `character` must be preceded by a backslash.
     *
     * @param character - The character under consideration.
     * @param next - The character that follows it, or `''` at end of text.
     */
    private needsEscape(character: string, next: string): boolean {
        const alwaysEscaped = character === '\\' || character === '_' || character === '[';
        const startsTag = character === '<' && MarkdownSerializer.TAG_START.test(next);
        return alwaysEscaped || startsTag || this.delimiterChars.has(character);
    }
}