ribbit/src/ts/serializer.ts

/*
 * serializer.ts — DOM to markdown serializer.
 *
 * Converts an HTML DOM tree back to markdown by walking the tree and
 * producing a typed token stream. Text tokens are escaped during final
 * serialization; delimiter tokens pass through verbatim. This separation
 * is what makes round-trip correctness possible — the serializer always
 * knows which characters are structural and which are literal.
 *
 *   const serializer = new MarkdownSerializer(tagMap, delimiterChars);
 *   serializer.serialize(document.getElementById('content'))
 *   // '**bold** and *italic*'
 */

import type { InlineToken } from './tokenizer';

/**
 * Describes how one kind of HTML element is turned back into markdown.
 * The serializer looks entries up by the element's `nodeName`.
 *
 * An entry normally sets exactly one of the two fields. When both are
 * present, `serialize` is consulted first and `delimiter` is ignored.
 */
export interface SerializerTagDef {
    /**
     * Markdown delimiter emitted on both sides of the element's
     * serialized children (e.g. '**' for bold).
     */
    delimiter?: string;
    /**
     * Escape hatch for elements that are more than a delimiter wrap
     * (e.g. links, code blocks, headings).
     *
     * Receives the element itself and a `children` thunk that, when
     * called, returns the element's children already serialized to
     * markdown. Must return the complete markdown for the element; the
     * returned string is inserted into the output as-is.
     */
    serialize?: (
        element: HTMLElement,
        children: () => string,
    ) => string;
}

/**
 * Converts a DOM tree to markdown. Walks the tree producing inline
 * tokens, then serializes the token stream to a string with correct
 * escaping: literal text is backslash-escaped, structural tokens are
 * emitted verbatim.
 *
 * `tagMap` is keyed by `element.nodeName` (upper-case in HTML documents,
 * e.g. 'STRONG'). `delimiterChars` lists the extra single characters
 * that must be escaped when they occur in literal text.
 *
 *   const serializer = new MarkdownSerializer(tagMap, new Set(['*', '`', '~', '[', '_']));
 *   const markdown = serializer.serialize(containerElement);
 */
export class MarkdownSerializer {
    /**
     * Characters that, directly after a `<`, would make a markdown parser
     * treat the text as raw HTML: a letter (open tag / autolink), `/`
     * (closing tag), `!` (comment, declaration, CDATA) or `?`
     * (processing instruction). Hoisted so it is compiled once rather
     * than once per character of input.
     */
    private static readonly HTML_START = /[a-zA-Z/!?]/;

    private readonly tagMap: Map<string, SerializerTagDef>;
    private readonly delimiterChars: Set<string>;

    /**
     * @param tagMap Per-element serialization rules, keyed by `nodeName`.
     * @param delimiterChars Characters to backslash-escape in literal text.
     */
    constructor(
        tagMap: Map<string, SerializerTagDef>,
        delimiterChars: Set<string>,
    ) {
        this.tagMap = tagMap;
        this.delimiterChars = delimiterChars;
    }

    /**
     * Serialize a DOM tree to a markdown string.
     *
     *   serializer.serialize(document.querySelector('article'))
     *
     * @param node Root of the subtree to serialize.
     * @returns The markdown for `node` and all of its descendants.
     */
    serialize(node: Node): string {
        return this.tokensToString(this.nodeToTokens(node));
    }

    /**
     * Convert a DOM node to a stream of inline tokens.
     * Text nodes become text tokens; elements with known tags
     * become delimiter-wrapped token sequences; unknown elements
     * recurse into their children. Any other node type (comments,
     * etc.) contributes nothing.
     */
    private nodeToTokens(node: Node): InlineToken[] {
        // nodeType 3 === Node.TEXT_NODE
        if (node.nodeType === 3) {
            return [{
                role: 'text',
                value: node.textContent ?? '',
            }];
        }
        // nodeType 1 === Node.ELEMENT_NODE; everything else is skipped
        if (node.nodeType !== 1) {
            return [];
        }

        const element = node as HTMLElement;
        const tagDef = this.tagMap.get(element.nodeName);

        // Custom serializer handles the entire element
        if (tagDef?.serialize) {
            const childrenMarkdown = () => this.serializeChildren(element);
            // Custom serializers return finished markdown. It is wrapped
            // in a single 'html' token, which tokensToString emits
            // verbatim, so the already-formatted string is never
            // escaped a second time.
            return [{
                role: 'html',
                value: tagDef.serialize(element, childrenMarkdown),
            }];
        }

        const inner = this.childrenToTokens(element);
        const delimiter = tagDef?.delimiter;

        // Unknown element: contributes only its children
        if (!delimiter) {
            return inner;
        }

        // A delimiter element with no content must not emit anything:
        // back-to-back delimiters ('****', '~~~~') do not re-parse as an
        // empty wrap but as literal text, a thematic break or a code
        // fence, which would break the round trip.
        if (inner.every((token) => token.value === '')) {
            return [];
        }

        // Delimiter-based element: emit open + children + close
        return [
            { role: 'open', value: delimiter, delimiter },
            ...inner,
            { role: 'close', value: delimiter, delimiter },
        ];
    }

    /**
     * Collect tokens from all child nodes of an element, in document
     * order.
     */
    private childrenToTokens(element: HTMLElement): InlineToken[] {
        const tokens: InlineToken[] = [];
        for (const child of Array.from(element.childNodes)) {
            tokens.push(...this.nodeToTokens(child));
        }
        return tokens;
    }

    /**
     * Serialize an element's children directly to a markdown string.
     * Used by custom serializers (links, headings, etc.) that need
     * the children as a string, not as tokens.
     */
    private serializeChildren(element: HTMLElement): string {
        return this.tokensToString(this.childrenToTokens(element));
    }

    /**
     * Convert a token stream to a markdown string. This is where
     * escaping happens: text tokens have their delimiter characters
     * backslash-escaped; every other token role is structural and
     * passes through verbatim.
     */
    private tokensToString(tokens: InlineToken[]): string {
        let result = '';
        for (const token of tokens) {
            result += token.role === 'text'
                ? this.escapeText(token.value)
                : token.value;
        }
        return result;
    }

    /**
     * Escape characters in literal text that would be misinterpreted
     * as markdown syntax on re-parse. Always escapes `\`, `_` and `[`;
     * escapes `<` only when the following character would start raw
     * HTML (see HTML_START); and escapes every character registered in
     * `delimiterChars`.
     *
     * @param text Literal text taken from a DOM text node.
     * @returns `text` with a backslash inserted before each such character.
     */
    private escapeText(text: string): string {
        let result = '';
        for (let position = 0; position < text.length; position++) {
            const character = text[position];
            const next = text[position + 1];
            const needsEscape =
                character === '\\' ||
                character === '_' ||
                character === '[' ||
                // A lone '<' (end of text, or before a space/digit) is
                // harmless and left alone unless registered below.
                (character === '<' &&
                    next !== undefined &&
                    MarkdownSerializer.HTML_START.test(next)) ||
                this.delimiterChars.has(character);
            result += needsEscape ? '\\' + character : character;
        }
        return result;
    }
}
Reimplement as a tokenizer with GFM parity 2026-04-29 15:48:36 -07:00			`/*`
			`* serializer.ts — DOM to markdown serializer.`
			`*`
			`* Converts an HTML DOM tree back to markdown by walking the tree and`
			`* producing a typed token stream. Text tokens are escaped during final`
			`* serialization; delimiter tokens pass through verbatim. This separation`
			`* is what makes round-trip correctness possible — the serializer always`
			`* knows which characters are structural and which are literal.`
			`*`
			`* const serializer = new MarkdownSerializer(tagMap, delimiterChars);`
			`* serializer.serialize(document.getElementById('content'))`
			`* // '**bold** and *italic*'`
			`*/`

			`import type { InlineToken } from './tokenizer';`

			`/**`
			`* Maps HTML element names to their markdown serialization.`
			`* Each entry defines how to convert an element back to markdown tokens.`
			`*/`
			`export interface SerializerTagDef {`
			`/** The canonical delimiter (e.g. '**' for bold). */`
			`delimiter?: string;`
			`/** Custom serializer for elements that aren't simple delimiter wraps`
			`* (e.g. links, code blocks, headings). Returns the full markdown`
			`* string for the element and its children. */`
			`serialize?: (element: HTMLElement, children: () => string) => string;`
			`}`

			`/**`
			`* Converts a DOM tree to markdown. Walks the tree producing inline`
			`* tokens, then serializes the token stream to a string with correct`
			`* escaping.`
			`*`
			* const serializer = new MarkdownSerializer(tagMap, new Set(['*', '`', '~', '[', '_']));
			`* const markdown = serializer.serialize(containerElement);`
			`*/`
			`export class MarkdownSerializer {`
			`private tagMap: Map<string, SerializerTagDef>;`
			`private delimiterChars: Set<string>;`

			`constructor(`
			`tagMap: Map<string, SerializerTagDef>,`
			`delimiterChars: Set<string>,`
			`) {`
			`this.tagMap = tagMap;`
			`this.delimiterChars = delimiterChars;`
			`}`

			`/**`
			`* Serialize a DOM tree to a markdown string.`
			`*`
			`* serializer.serialize(document.querySelector('article'))`
			`*/`
			`serialize(node: Node): string {`
			`const tokens = this.nodeToTokens(node);`
			`return this.tokensToString(tokens);`
			`}`

			`/**`
			`* Convert a DOM node to a stream of inline tokens.`
			`* Text nodes become text tokens; elements with known tags`
			`* become delimiter-wrapped token sequences; unknown elements`
			`* recurse into their children.`
			`*/`
			`private nodeToTokens(node: Node): InlineToken[] {`
			`if (node.nodeType === 3) {`
			`return [{`
			`role: 'text',`
			`value: node.textContent \|\| '',`
			`}];`
			`}`
			`if (node.nodeType !== 1) {`
			`return [];`
			`}`

			`const element = node as HTMLElement;`
			`const tagDef = this.tagMap.get(element.nodeName);`

			`// Custom serializer handles the entire element`
			`if (tagDef?.serialize) {`
			`const childrenMarkdown = () => this.serializeChildren(element);`
			`const markdown = tagDef.serialize(element, childrenMarkdown);`
			`// Custom serializers return raw markdown strings — wrap`
			`// in a single text token that won't be escaped (it's already`
			`// correctly formatted)`
			`return [{`
			`role: 'html',`
			`value: markdown,`
			`}];`
			`}`

			`// Delimiter-based element: emit open + children + close`
			`if (tagDef?.delimiter) {`
			`const delimiter = tagDef.delimiter;`
			`return [`
			`{`
			`role: 'open',`
			`value: delimiter,`
			`delimiter,`
			`},`
			`...this.childrenToTokens(element),`
			`{`
			`role: 'close',`
			`value: delimiter,`
			`delimiter,`
			`},`
			`];`
			`}`

			`// Unknown element: just recurse into children`
			`return this.childrenToTokens(element);`
			`}`

			`/**`
			`* Collect tokens from all child nodes of an element.`
			`*/`
			`private childrenToTokens(element: HTMLElement): InlineToken[] {`
			`const tokens: InlineToken[] = [];`
			`for (const child of Array.from(element.childNodes)) {`
			`tokens.push(...this.nodeToTokens(child));`
			`}`
			`return tokens;`
			`}`

			`/**`
			`* Serialize an element's children directly to a markdown string.`
			`* Used by custom serializers (links, headings, etc.) that need`
			`* the children as a string, not as tokens.`
			`*/`
			`private serializeChildren(element: HTMLElement): string {`
			`const tokens = this.childrenToTokens(element);`
			`return this.tokensToString(tokens);`
			`}`

			`/**`
			`* Convert a token stream to a markdown string. This is where`
			`* escaping happens: text tokens have their delimiter characters`
			`* backslash-escaped; all other token types pass through verbatim.`
			`*/`
			`private tokensToString(tokens: InlineToken[]): string {`
			`let result = '';`
			`for (const token of tokens) {`
			`switch (token.role) {`
			`case 'text':`
			`result += this.escapeText(token.value);`
			`break;`
			`case 'open':`
			`case 'close':`
			`case 'html':`
			`case 'break':`
			`// Structural tokens are never escaped`
			`result += token.value;`
			`break;`
			`case 'code':`
			`result += token.value;`
			`break;`
			`case 'link':`
			`result += token.value;`
			`break;`
			`case 'autolink':`
			`result += token.value;`
			`break;`
			`default:`
			`result += token.value;`
			`}`
			`}`
			`return result;`
			`}`

			`/**`
			`* Escape characters in literal text that would be misinterpreted`
			`* as markdown syntax on re-parse. Only escapes characters that are`
			* registered as delimiter characters, plus `\`, `[`, `_`, and `<`
			`* before letters (HTML passthrough prevention).`
			`*/`
			`private escapeText(text: string): string {`
			`let result = '';`
			`for (let position = 0; position < text.length; position++) {`
			`const character = text[position];`
			`if (character === '\\') {`
			`result += '\\\\';`
			`} else if (character === '_') {`
			`result += '\\_';`
			`} else if (character === '[') {`
			`result += '\\[';`
			`} else if (character === '<' && position + 1 < text.length && /[a-zA-Z/]/.test(text[position + 1])) {`
			`// Only escape < when it would start an HTML tag`
			`result += '\\<';`
			`} else if (this.delimiterChars.has(character)) {`
			`result += '\\' + character;`
			`} else {`
			`result += character;`
			`}`
			`}`
			`return result;`
			`}`
			`}`