Fix HTML to MFM (#7150)

* Fix type
* Fix HTML to MFM
2024-11-22 01:47:39 -07:00 · 2021-02-06 21:44:46 +09:00 · 2021-02-06 21:44:46 +09:00 · a1bbf7d9ca
commit a1bbf7d9ca
parent b7ed6a6c56
2 changed files with 55 additions and 19 deletions
--- a/src/mfm/from-html.ts
+++ b/src/mfm/from-html.ts
@ -1,8 +1,10 @@
-import { parseFragment, DefaultTreeDocumentFragment } from 'parse5';
-import { urlRegexFull } from './prelude';
+import * as parse5 from 'parse5';
+import treeAdapter = require('parse5/lib/tree-adapters/default');
+import { URL } from 'url';
+import { urlRegex, urlRegexFull } from './prelude';

 export function fromHtml(html: string, hashtagNames?: string[]): string {
-	const dom = parseFragment(html) as DefaultTreeDocumentFragment;
+	const dom = parse5.parseFragment(html);

 	let text = '';

@ -12,30 +14,35 @@ export function fromHtml(html: string, hashtagNames?: string[]): string {

 	return text.trim();

-	function getText(node: any): string {
-		if (node.nodeName === '#text') return node.value;
+	function getText(node: parse5.Node): string {
+		if (treeAdapter.isTextNode(node)) return node.value;
+		if (!treeAdapter.isElementNode(node)) return '';

 		if (node.childNodes) {
-			return node.childNodes.map((n: any) => getText(n)).join('');
+			return node.childNodes.map(n => getText(n)).join('');
 		}

 		return '';
 	}

-	function analyze(node: any) {
-		switch (node.nodeName) {
-			case '#text':
-				text += node.value;
-				break;
+	function analyze(node: parse5.Node) {
+		if (treeAdapter.isTextNode(node)) {
+			text += node.value;
+			return;
+		}

+		// Skip comment or document type node
+		if (!treeAdapter.isElementNode(node)) return;
+
+		switch (node.nodeName) {
 			case 'br':
 				text += '\n';
 				break;

 			case 'a':
 				const txt = getText(node);
-				const rel = node.attrs.find((x: any) => x.name === 'rel');
-				const href = node.attrs.find((x: any) => x.name === 'href');
+				const rel = node.attrs.find(x => x.name === 'rel');
+				const href = node.attrs.find(x => x.name === 'href');

 				// ハッシュタグ
 				if (hashtagNames && href && hashtagNames.map(x => x.toLowerCase()).includes(txt.toLowerCase())) {
@ -44,7 +51,7 @@ export function fromHtml(html: string, hashtagNames?: string[]): string {
 				} else if (txt.startsWith('@') && !(rel && rel.value.match(/^me /))) {
 					const part = txt.split('@');

-					if (part.length === 2) {
+					if (part.length === 2 && href) {
 						//#region ホスト名部分が省略されているので復元する
 						const acct = `${txt}@${(new URL(href.value)).hostname}`;
 						text += acct;
@ -54,11 +61,28 @@ export function fromHtml(html: string, hashtagNames?: string[]): string {
 					}
 				// その他
 				} else {
-					text += !href ? txt
-						: txt === href.value
-							? txt.match(urlRegexFull) ? txt
-							: `<${txt}>`
-						: `[${txt}](${href.value})`;
+					const generateLink = () => {
+						if (!href && !txt) {
+							return '';
+						}
+						if (!href) {
+							return txt;
+						}
+						if (!txt || txt === href.value) {	// #6383: Missing text node
+							if (href.value.match(urlRegexFull)) {
+								return href.value;
+							} else {
+								return `<${href.value}>`;
+							}
+						}
+						if (href.value.match(urlRegex) && !href.value.match(urlRegexFull)) {
+							return `[${txt}](<${href.value}>)`;	// #6846
+						} else {
+							return `[${txt}](${href.value})`;
+						}
+					};
+
+					text += generateLink();
 				}
 				break;

--- a/test/mfm.ts
+++ b/test/mfm.ts
@ -1167,6 +1167,10 @@ describe('fromHtml', () => {
 		assert.deepStrictEqual(fromHtml('<p>a <a href="https://example.com/b">c</a> d</p>'), 'a [c](https://example.com/b) d');
 	});

+	it('link with different text, but not encoded', () => {
+		assert.deepStrictEqual(fromHtml('<p>a <a href="https://example.com/ä">c</a> d</p>'), 'a [c](<https://example.com/ä>) d');
+	});
+
 	it('link with same text', () => {
 		assert.deepStrictEqual(fromHtml('<p>a <a href="https://example.com/b">https://example.com/b</a> d</p>'), 'a https://example.com/b d');
 	});
@ -1183,6 +1187,14 @@ describe('fromHtml', () => {
 		assert.deepStrictEqual(fromHtml('<p>a <a>c</a> d</p>'), 'a c d');
 	});

+	it('link without text', () => {
+		assert.deepStrictEqual(fromHtml('<p>a <a href="https://example.com/b"></a> d</p>'), 'a https://example.com/b d');
+	});
+
+	it('link without both', () => {
+		assert.deepStrictEqual(fromHtml('<p>a <a></a> d</p>'), 'a  d');
+	});
+
 	it('mention', () => {
 		assert.deepStrictEqual(fromHtml('<p>a <a href="https://example.com/@user" class="u-url mention">@user</a> d</p>'), 'a @user@example.com d');
 	});