From a1bbf7d9ca9a6a8536022742dff9e1565a03eeb1 Mon Sep 17 00:00:00 2001
From: MeiMei <30769358+mei23@users.noreply.github.com>
Date: Sat, 6 Feb 2021 21:44:46 +0900
Subject: [PATCH] Fix HTML to MFM (#7150)

* Fix type

* Fix HTML to MFM
---
 src/mfm/from-html.ts | 62 ++++++++++++++++++++++++++++++--------------
 test/mfm.ts          | 12 +++++++++
 2 files changed, 55 insertions(+), 19 deletions(-)
diff --git a/src/mfm/from-html.ts b/src/mfm/from-html.ts
index 4c27c2cbb..0b4f9b894 100644
--- a/src/mfm/from-html.ts
+++ b/src/mfm/from-html.ts
@@ -1,8 +1,10 @@
-import { parseFragment, DefaultTreeDocumentFragment } from 'parse5';
-import { urlRegexFull } from './prelude';
+import * as parse5 from 'parse5';
+import treeAdapter = require('parse5/lib/tree-adapters/default');
+import { URL } from 'url';
+import { urlRegex, urlRegexFull } from './prelude';
 
 export function fromHtml(html: string, hashtagNames?: string[]): string {
-	const dom = parseFragment(html) as DefaultTreeDocumentFragment;
+	const dom = parse5.parseFragment(html);
 
 	let text = '';
 
@@ -12,30 +14,35 @@ export function fromHtml(html: string, hashtagNames?: string[]): string {
 
 	return text.trim();
 
-	function getText(node: any): string {
-		if (node.nodeName === '#text') return node.value;
+	function getText(node: parse5.Node): string {
+		if (treeAdapter.isTextNode(node)) return node.value;
+		if (!treeAdapter.isElementNode(node)) return '';
 
 		if (node.childNodes) {
-			return node.childNodes.map((n: any) => getText(n)).join('');
+			return node.childNodes.map(n => getText(n)).join('');
 		}
 
 		return '';
 	}
 
-	function analyze(node: any) {
-		switch (node.nodeName) {
-			case '#text':
-				text += node.value;
-				break;
+	function analyze(node: parse5.Node) {
+		if (treeAdapter.isTextNode(node)) {
+			text += node.value;
+			return;
+		}
 
+		// Skip non-element nodes such as comments and document type nodes
+		if (!treeAdapter.isElementNode(node)) return;
+
+		switch (node.nodeName) {
 			case 'br':
 				text += '\n';
 				break;
 
 			case 'a':
 				const txt = getText(node);
-				const rel = node.attrs.find((x: any) => x.name === 'rel');
-				const href = node.attrs.find((x: any) => x.name === 'href');
+				const rel = node.attrs.find(x => x.name === 'rel');
+				const href = node.attrs.find(x => x.name === 'href');
 
 				// ハッシュタグ
 				if (hashtagNames && href && hashtagNames.map(x => x.toLowerCase()).includes(txt.toLowerCase())) {
@@ -44,7 +51,7 @@ export function fromHtml(html: string, hashtagNames?: string[]): string {
 				} else if (txt.startsWith('@') && !(rel && rel.value.match(/^me /))) {
 					const part = txt.split('@');
 
-					if (part.length === 2) {
+					if (part.length === 2 && href) {
 						//#region ホスト名部分が省略されているので復元する
 						const acct = `${txt}@${(new URL(href.value)).hostname}`;
 						text += acct;
@@ -54,11 +61,28 @@ export function fromHtml(html: string, hashtagNames?: string[]): string {
 					}
 				// その他
 				} else {
-					text += !href ? txt
-						: txt === href.value
-							? txt.match(urlRegexFull) ? txt
-							: `<${txt}>`
-						: `[${txt}](${href.value})`;
+					const generateLink = () => {	// Builds the MFM text for an anchor that is neither a hashtag nor a mention
+						if (!href && !txt) {	// Neither an href attribute nor any text: emit nothing
+							return '';
+						}
+						if (!href) {	// Text only: emit the text as-is
+							return txt;
+						}
+						if (!txt || txt === href.value) {	// #6383: no link text (or text equal to the href), so emit the href itself
+							if (href.value.match(urlRegexFull)) {
+								return href.value;
+							} else {
+								return `<${href.value}>`;
+							}
+						}
+						if (href.value.match(urlRegex) && !href.value.match(urlRegexFull)) {
+							return `[${txt}](<${href.value}>)`;	// #6846: href matches urlRegex but not urlRegexFull, so wrap it in <>
+						} else {
+							return `[${txt}](${href.value})`;
+						}
+					};
+
+					text += generateLink();
 				}
 				break;
 
diff --git a/test/mfm.ts b/test/mfm.ts
index a32457e89..0a120f96e 100644
--- a/test/mfm.ts
+++ b/test/mfm.ts
@@ -1167,6 +1167,10 @@ describe('fromHtml', () => {
 		assert.deepStrictEqual(fromHtml('<p>a <a href="https://example.com/b">c</a> d</p>'), 'a [c](https://example.com/b) d');
 	});
 
+	it('link with different text, but not encoded', () => {
+		assert.deepStrictEqual(fromHtml('<p>a <a href="https://example.com/ä">c</a> d</p>'), 'a [c](<https://example.com/ä>) d');
+	});
+
 	it('link with same text', () => {
 		assert.deepStrictEqual(fromHtml('<p>a <a href="https://example.com/b">https://example.com/b</a> d</p>'), 'a https://example.com/b d');
 	});
@@ -1183,6 +1187,14 @@ describe('fromHtml', () => {
 		assert.deepStrictEqual(fromHtml('<p>a <a>c</a> d</p>'), 'a c d');
 	});
 
+	it('link without text', () => {
+		assert.deepStrictEqual(fromHtml('<p>a <a href="https://example.com/b"></a> d</p>'), 'a https://example.com/b d');
+	});
+
+	it('link without both', () => {
+		assert.deepStrictEqual(fromHtml('<p>a <a></a> d</p>'), 'a  d');
+	});
+
 	it('mention', () => {
 		assert.deepStrictEqual(fromHtml('<p>a <a href="https://example.com/@user" class="u-url mention">@user</a> d</p>'), 'a @user@example.com d');
 	});