mirror of
https://github.com/Tyrrrz/DiscordChatExporter.git
synced 2026-02-23 09:44:15 +00:00
Implement a more sophisticated markdown parsing engine (#145)
This commit is contained in:
@@ -0,0 +1,12 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFramework>net461</TargetFramework>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="Sprache" Version="2.2.0" />
|
||||
<PackageReference Include="Tyrrrz.Extensions" Version="1.5.1" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
21
DiscordChatExporter.Core.Markdown/EmojiNode.cs
Normal file
21
DiscordChatExporter.Core.Markdown/EmojiNode.cs
Normal file
@@ -0,0 +1,21 @@
|
||||
namespace DiscordChatExporter.Core.Markdown
|
||||
{
|
||||
/// <summary>
/// Markdown node that represents a custom emoji reference.
/// </summary>
public class EmojiNode : Node
{
    /// <summary>
    /// Identifier of the emoji, kept as a string.
    /// </summary>
    public string Id { get; }

    /// <summary>
    /// Name of the emoji.
    /// </summary>
    public string Name { get; }

    /// <summary>
    /// Whether the emoji was flagged as animated by whoever constructed the node.
    /// </summary>
    public bool IsAnimated { get; }

    /// <summary>
    /// Creates an emoji node.
    /// </summary>
    /// <param name="lexeme">Source text this node was produced from.</param>
    /// <param name="id">Emoji identifier.</param>
    /// <param name="name">Emoji name.</param>
    /// <param name="isAnimated">Whether the emoji is animated.</param>
    public EmojiNode(string lexeme, string id, string name, bool isAnimated)
        : base(lexeme)
    {
        this.Id = id;
        this.Name = name;
        this.IsAnimated = isAnimated;
    }

    /// <summary>
    /// Returns a short debug representation containing the emoji name.
    /// </summary>
    public override string ToString()
    {
        return $"<Emoji> {Name}";
    }
}
|
||||
}
|
||||
23
DiscordChatExporter.Core.Markdown/FormattedNode.cs
Normal file
23
DiscordChatExporter.Core.Markdown/FormattedNode.cs
Normal file
@@ -0,0 +1,23 @@
|
||||
using System.Collections.Generic;
|
||||
|
||||
namespace DiscordChatExporter.Core.Markdown
|
||||
{
|
||||
/// <summary>
/// Markdown node that applies a single kind of formatting to a list of child nodes.
/// </summary>
public class FormattedNode : Node
{
    /// <summary>
    /// Delimiter token associated with the formatting (the grammar passes values such as "**" or "~~").
    /// </summary>
    public string Token { get; }

    /// <summary>
    /// Kind of formatting applied to <see cref="Children"/>.
    /// </summary>
    public TextFormatting Formatting { get; }

    /// <summary>
    /// Nodes contained inside the formatted span.
    /// </summary>
    public IReadOnlyList<Node> Children { get; }

    /// <summary>
    /// Creates a formatted node.
    /// </summary>
    /// <param name="lexeme">Source text this node was produced from.</param>
    /// <param name="token">Delimiter token of the formatting.</param>
    /// <param name="formatting">Kind of formatting.</param>
    /// <param name="children">Nodes nested inside the formatted span.</param>
    public FormattedNode(string lexeme, string token, TextFormatting formatting, IReadOnlyList<Node> children)
        : base(lexeme)
    {
        this.Token = token;
        this.Formatting = formatting;
        this.Children = children;
    }

    /// <summary>
    /// Returns a short debug representation with the formatting kind and the number of direct children.
    /// </summary>
    public override string ToString()
    {
        return $"<{Formatting}> ({Children.Count} direct children)";
    }
}
|
||||
}
|
||||
15
DiscordChatExporter.Core.Markdown/InlineCodeBlockNode.cs
Normal file
15
DiscordChatExporter.Core.Markdown/InlineCodeBlockNode.cs
Normal file
@@ -0,0 +1,15 @@
|
||||
namespace DiscordChatExporter.Core.Markdown
|
||||
{
|
||||
/// <summary>
/// Markdown node that represents an inline code span.
/// </summary>
public class InlineCodeBlockNode : Node
{
    /// <summary>
    /// Code carried by the node.
    /// </summary>
    public string Code { get; }

    /// <summary>
    /// Creates an inline code node.
    /// </summary>
    /// <param name="lexeme">Source text this node was produced from.</param>
    /// <param name="code">Code contained in the span.</param>
    public InlineCodeBlockNode(string lexeme, string code)
        : base(lexeme)
    {
        this.Code = code;
    }

    /// <summary>
    /// Returns a short debug representation containing the code.
    /// </summary>
    public override string ToString()
    {
        return $"<Code> {Code}";
    }
}
|
||||
}
|
||||
157
DiscordChatExporter.Core.Markdown/Internal/Grammar.cs
Normal file
157
DiscordChatExporter.Core.Markdown/Internal/Grammar.cs
Normal file
@@ -0,0 +1,157 @@
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Text.RegularExpressions;
|
||||
using Sprache;
|
||||
using Tyrrrz.Extensions;
|
||||
|
||||
namespace DiscordChatExporter.Core.Markdown.Internal
|
||||
{
|
||||
// The following parsing logic is meant to replicate Discord's markdown grammar as closely as possible
|
||||
/// <summary>
/// Parser definitions that turn a markdown string into a list of <see cref="Node"/> objects.
/// </summary>
/// <remarks>
/// All parsers are static fields, and static field initializers run in declaration order, so every
/// aggregator has to be declared after the parsers it combines. Formatted nodes parse their inner
/// content by calling <see cref="BuildTree"/> from within their selector, which is what makes the
/// resulting structure a tree.
/// </remarks>
internal static class Grammar
{
    /* Formatting */

    // Capture until the earliest double asterisk not followed by an asterisk
    private static readonly Parser<Node> BoldFormattedNode =
        Parse.RegexMatch(new Regex("\\*\\*(.+?)\\*\\*(?!\\*)", RegexOptions.Singleline))
            .Select(m => new FormattedNode(m.Value, "**", TextFormatting.Bold, BuildTree(m.Groups[1].Value)));

    // Capture until the earliest single asterisk not preceded or followed by an asterisk
    // Can't have whitespace right after opening or right before closing asterisk
    private static readonly Parser<Node> ItalicFormattedNode =
        Parse.RegexMatch(new Regex("\\*(?!\\s)(.+?)(?<!\\s|\\*)\\*(?!\\*)", RegexOptions.Singleline))
            .Select(m => new FormattedNode(m.Value, "*", TextFormatting.Italic, BuildTree(m.Groups[1].Value)));

    // Can't have underscores inside
    // Can't have word characters right after closing underscore
    private static readonly Parser<Node> ItalicAltFormattedNode =
        Parse.RegexMatch(new Regex("_([^_]+?)_(?!\\w)", RegexOptions.Singleline))
            .Select(m => new FormattedNode(m.Value, "_", TextFormatting.Italic, BuildTree(m.Groups[1].Value)));

    // Treated as a separate entity for simplicity
    // Capture until the earliest triple asterisk not followed by an asterisk
    // The captured group keeps the inner double asterisks, so the recursive parse yields a bold child
    private static readonly Parser<Node> ItalicBoldFormattedNode =
        Parse.RegexMatch(new Regex("\\*(\\*\\*(?:.+?)\\*\\*)\\*(?!\\*)", RegexOptions.Singleline))
            .Select(m => new FormattedNode(m.Value, "*", TextFormatting.Italic, BuildTree(m.Groups[1].Value)));

    // Capture until the earliest double underscore not followed by an underscore
    private static readonly Parser<Node> UnderlineFormattedNode =
        Parse.RegexMatch(new Regex("__(.+?)__(?!_)", RegexOptions.Singleline))
            .Select(m => new FormattedNode(m.Value, "__", TextFormatting.Underline, BuildTree(m.Groups[1].Value)));

    // Treated as a separate entity for simplicity
    // Capture until the earliest triple underscore not followed by an underscore
    // The captured group keeps the inner double underscores, so the recursive parse yields an underline child
    private static readonly Parser<Node> ItalicUnderlineFormattedNode =
        Parse.RegexMatch(new Regex("_(__(?:.+?)__)_(?!_)", RegexOptions.Singleline))
            .Select(m => new FormattedNode(m.Value, "_", TextFormatting.Italic, BuildTree(m.Groups[1].Value)));

    // Strikethrough is safe
    private static readonly Parser<Node> StrikethroughFormattedNode =
        Parse.RegexMatch(new Regex("~~(.+?)~~", RegexOptions.Singleline))
            .Select(m => new FormattedNode(m.Value, "~~", TextFormatting.Strikethrough, BuildTree(m.Groups[1].Value)));

    // Spoiler is safe
    private static readonly Parser<Node> SpoilerFormattedNode =
        Parse.RegexMatch(new Regex("\\|\\|(.+?)\\|\\|", RegexOptions.Singleline))
            .Select(m => new FormattedNode(m.Value, "||", TextFormatting.Spoiler, BuildTree(m.Groups[1].Value)));

    // Aggregator, order matters
    private static readonly Parser<Node> AnyFormattedNode =
        ItalicBoldFormattedNode.Or(ItalicUnderlineFormattedNode)
            .Or(BoldFormattedNode).Or(ItalicFormattedNode)
            .Or(UnderlineFormattedNode).Or(ItalicAltFormattedNode)
            .Or(StrikethroughFormattedNode).Or(SpoilerFormattedNode);

    /* Code blocks */

    // Can't have backticks inside and surrounding whitespace is trimmed
    private static readonly Parser<Node> InlineCodeBlockNode =
        Parse.RegexMatch(new Regex("`\\s*([^`]+?)\\s*`", RegexOptions.Singleline))
            .Select(m => new InlineCodeBlockNode(m.Value, m.Groups[1].Value));

    // The first word is a language identifier if it's the only word followed by a newline, the rest is code
    // The code part is lazy so the block ends at the earliest closing fence; a greedy match would merge
    // two separate code blocks in the same message (and everything between them) into one node
    private static readonly Parser<Node> MultilineCodeBlockNode =
        Parse.RegexMatch(new Regex("```(?:(\\w*?)?(?:\\s*?\\n))?(.+?)```", RegexOptions.Singleline))
            .Select(m => new MultilineCodeBlockNode(m.Value, m.Groups[1].Value, m.Groups[2].Value));

    // Aggregator, order matters
    private static readonly Parser<Node> AnyCodeBlockNode = MultilineCodeBlockNode.Or(InlineCodeBlockNode);

    /* Mentions */

    // @everyone or @here
    private static readonly Parser<Node> MetaMentionNode = Parse.RegexMatch("@(everyone|here)")
        .Select(m => new MentionNode(m.Value, m.Groups[1].Value, MentionType.Meta));

    // <@123456> or <@!123456>
    private static readonly Parser<Node> UserMentionNode = Parse.RegexMatch("<@!?(\\d+)>")
        .Select(m => new MentionNode(m.Value, m.Groups[1].Value, MentionType.User));

    // <#123456>
    private static readonly Parser<Node> ChannelMentionNode = Parse.RegexMatch("<#(\\d+)>")
        .Select(m => new MentionNode(m.Value, m.Groups[1].Value, MentionType.Channel));

    // <@&123456>
    private static readonly Parser<Node> RoleMentionNode = Parse.RegexMatch("<@&(\\d+)>")
        .Select(m => new MentionNode(m.Value, m.Groups[1].Value, MentionType.Role));

    // Aggregator, order matters
    private static readonly Parser<Node> AnyMentionNode =
        MetaMentionNode.Or(UserMentionNode).Or(ChannelMentionNode).Or(RoleMentionNode);

    /* Emojis */

    // <:lul:123456> or <a:lul:123456>
    // The name is matched lazily so that two emojis on the same line are not fused into a single node
    private static readonly Parser<Node> EmojiNode = Parse.RegexMatch("<(a)?:(.+?):(\\d+)>")
        .Select(m => new EmojiNode(m.Value, m.Groups[3].Value, m.Groups[2].Value, m.Groups[1].Value.IsNotBlank()));

    // Aggregator, order matters
    private static readonly Parser<Node> AnyEmojiNode = EmojiNode;

    /* Links */

    // [title](link)
    // Both parts are matched lazily so that several titled links on the same line stay separate nodes
    private static readonly Parser<Node> TitledLinkNode = Parse.RegexMatch("\\[(.+?)\\]\\((.+?)\\)")
        .Select(m => new LinkNode(m.Value, m.Groups[2].Value, m.Groups[1].Value));

    // Starts with http:// or https://, stops at the last non-whitespace character followed by whitespace or punctuation character
    private static readonly Parser<Node> AutoLinkNode = Parse.RegexMatch("(https?://\\S*[^\\.,:;\"\'\\s])")
        .Select(m => new LinkNode(m.Value, m.Groups[1].Value));

    // Autolink surrounded by angular brackets
    private static readonly Parser<Node> HiddenLinkNode = Parse.RegexMatch("<(https?://\\S*[^\\.,:;\"\'\\s])>")
        .Select(m => new LinkNode(m.Value, m.Groups[1].Value));

    // Aggregator, order matters
    private static readonly Parser<Node> AnyLinkNode = TitledLinkNode.Or(HiddenLinkNode).Or(AutoLinkNode);

    /* Text */

    // Shrug is an exception and needs to be exempt from formatting
    private static readonly Parser<Node> ShrugTextNode =
        Parse.String("¯\\_(ツ)_/¯").Text().Select(s => new TextNode(s));

    // Backslash escapes any following non-whitespace character except for digits and latin letters
    // The lexeme keeps the backslash while the node text is only the escaped character
    private static readonly Parser<Node> EscapedTextNode =
        Parse.RegexMatch("\\\\([^a-zA-Z0-9\\s])").Select(m => new TextNode(m.Value, m.Groups[1].Value));

    // Aggregator, order matters
    private static readonly Parser<Node> AnyTextNode = ShrugTextNode.Or(EscapedTextNode);

    /* Aggregator and fallback */

    // Any node recognized by above patterns
    private static readonly Parser<Node> AnyRecognizedNode = AnyFormattedNode.Or(AnyCodeBlockNode)
        .Or(AnyMentionNode).Or(AnyEmojiNode).Or(AnyLinkNode).Or(AnyTextNode);

    // Any node not recognized by above patterns (treated as plain text)
    // Consumes characters one by one for as long as no recognized node starts at the current position
    private static readonly Parser<Node> FallbackNode =
        Parse.AnyChar.Except(AnyRecognizedNode).AtLeastOnce().Text().Select(s => new TextNode(s));

    // Any node
    private static readonly Parser<Node> AnyNode = AnyRecognizedNode.Or(FallbackNode);

    /// <summary>
    /// Entry point: parses <paramref name="input"/> into a list of top-level nodes.
    /// </summary>
    /// <param name="input">Markdown text to parse.</param>
    /// <returns>
    /// Top-level nodes in the order they were parsed. Formatted nodes carry their own children,
    /// obtained by calling this method recursively on their inner content.
    /// </returns>
    public static IReadOnlyList<Node> BuildTree(string input) => AnyNode.Many().Parse(input).ToArray();
}
|
||||
}
|
||||
22
DiscordChatExporter.Core.Markdown/LinkNode.cs
Normal file
22
DiscordChatExporter.Core.Markdown/LinkNode.cs
Normal file
@@ -0,0 +1,22 @@
|
||||
namespace DiscordChatExporter.Core.Markdown
|
||||
{
|
||||
/// <summary>
/// Markdown node that represents a hyperlink.
/// </summary>
public class LinkNode : Node
{
    /// <summary>
    /// Target of the link.
    /// </summary>
    public string Url { get; }

    /// <summary>
    /// Text associated with the link; equal to <see cref="Url"/> when the node was created without a title.
    /// </summary>
    public string Title { get; }

    /// <summary>
    /// Creates a link node with an explicit title.
    /// </summary>
    /// <param name="lexeme">Source text this node was produced from.</param>
    /// <param name="url">Target of the link.</param>
    /// <param name="title">Title of the link.</param>
    public LinkNode(string lexeme, string url, string title)
        : base(lexeme)
    {
        this.Url = url;
        this.Title = title;
    }

    /// <summary>
    /// Creates a link node whose title is the URL itself.
    /// </summary>
    /// <param name="lexeme">Source text this node was produced from.</param>
    /// <param name="url">Target of the link, also used as the title.</param>
    public LinkNode(string lexeme, string url)
        : this(lexeme, url, url)
    {
    }

    /// <summary>
    /// Returns a short debug representation containing the link title.
    /// </summary>
    public override string ToString()
    {
        return $"<Link> {Title}";
    }
}
|
||||
}
|
||||
10
DiscordChatExporter.Core.Markdown/MarkdownParser.cs
Normal file
10
DiscordChatExporter.Core.Markdown/MarkdownParser.cs
Normal file
@@ -0,0 +1,10 @@
|
||||
using System.Collections.Generic;
|
||||
using DiscordChatExporter.Core.Markdown.Internal;
|
||||
|
||||
namespace DiscordChatExporter.Core.Markdown
|
||||
{
|
||||
/// <summary>
/// Public entry point of the markdown parsing library.
/// </summary>
public static class MarkdownParser
{
    /// <summary>
    /// Parses the given markdown text into a list of nodes by delegating to the internal grammar.
    /// </summary>
    /// <param name="input">Markdown text to parse.</param>
    /// <returns>Nodes produced by <c>Grammar.BuildTree</c> for the given input.</returns>
    public static IReadOnlyList<Node> Parse(string input)
    {
        return Grammar.BuildTree(input);
    }
}
|
||||
}
|
||||
18
DiscordChatExporter.Core.Markdown/MentionNode.cs
Normal file
18
DiscordChatExporter.Core.Markdown/MentionNode.cs
Normal file
@@ -0,0 +1,18 @@
|
||||
namespace DiscordChatExporter.Core.Markdown
|
||||
{
|
||||
/// <summary>
/// Markdown node that represents a mention.
/// </summary>
public class MentionNode : Node
{
    /// <summary>
    /// Identifier of the mentioned entity, kept as a string.
    /// </summary>
    public string Id { get; }

    /// <summary>
    /// Kind of entity being mentioned.
    /// </summary>
    public MentionType Type { get; }

    /// <summary>
    /// Creates a mention node.
    /// </summary>
    /// <param name="lexeme">Source text this node was produced from.</param>
    /// <param name="id">Identifier of the mentioned entity.</param>
    /// <param name="type">Kind of mention.</param>
    public MentionNode(string lexeme, string id, MentionType type)
        : base(lexeme)
    {
        this.Id = id;
        this.Type = type;
    }

    /// <summary>
    /// Returns a short debug representation containing the mention kind and identifier.
    /// </summary>
    public override string ToString()
    {
        return $"<{Type} mention> {Id}";
    }
}
|
||||
}
|
||||
10
DiscordChatExporter.Core.Markdown/MentionType.cs
Normal file
10
DiscordChatExporter.Core.Markdown/MentionType.cs
Normal file
@@ -0,0 +1,10 @@
|
||||
namespace DiscordChatExporter.Core.Markdown
|
||||
{
|
||||
/// <summary>
/// Kinds of mention that a mention node can carry.
/// </summary>
public enum MentionType
{
    /// <summary>Meta mention.</summary>
    Meta = 0,

    /// <summary>Mention of a user.</summary>
    User = 1,

    /// <summary>Mention of a channel.</summary>
    Channel = 2,

    /// <summary>Mention of a role.</summary>
    Role = 3
}
|
||||
}
|
||||
18
DiscordChatExporter.Core.Markdown/MultilineCodeBlockNode.cs
Normal file
18
DiscordChatExporter.Core.Markdown/MultilineCodeBlockNode.cs
Normal file
@@ -0,0 +1,18 @@
|
||||
namespace DiscordChatExporter.Core.Markdown
|
||||
{
|
||||
/// <summary>
/// Markdown node that represents a fenced (multiline) code block.
/// </summary>
public class MultilineCodeBlockNode : Node
{
    /// <summary>
    /// Language identifier attached to the block.
    /// </summary>
    public string Language { get; }

    /// <summary>
    /// Code carried by the block.
    /// </summary>
    public string Code { get; }

    /// <summary>
    /// Creates a multiline code block node.
    /// </summary>
    /// <param name="lexeme">Source text this node was produced from.</param>
    /// <param name="language">Language identifier of the block.</param>
    /// <param name="code">Code contained in the block.</param>
    public MultilineCodeBlockNode(string lexeme, string language, string code)
        : base(lexeme)
    {
        this.Language = language;
        this.Code = code;
    }

    /// <summary>
    /// Returns a short debug representation containing the language and the code.
    /// </summary>
    public override string ToString()
    {
        return $"<Code [{Language}]> {Code}";
    }
}
|
||||
}
|
||||
12
DiscordChatExporter.Core.Markdown/Node.cs
Normal file
12
DiscordChatExporter.Core.Markdown/Node.cs
Normal file
@@ -0,0 +1,12 @@
|
||||
namespace DiscordChatExporter.Core.Markdown
|
||||
{
|
||||
/// <summary>
/// Base class of every markdown node.
/// </summary>
public abstract class Node
{
    /// <summary>
    /// Source text the node was constructed with.
    /// </summary>
    public string Lexeme { get; }

    /// <summary>
    /// Stores the lexeme for the derived node.
    /// </summary>
    /// <param name="lexeme">Source text this node was produced from.</param>
    protected Node(string lexeme)
    {
        this.Lexeme = lexeme;
    }
}
|
||||
}
|
||||
11
DiscordChatExporter.Core.Markdown/TextFormatting.cs
Normal file
11
DiscordChatExporter.Core.Markdown/TextFormatting.cs
Normal file
@@ -0,0 +1,11 @@
|
||||
namespace DiscordChatExporter.Core.Markdown
|
||||
{
|
||||
/// <summary>
/// Kinds of formatting that a formatted node can apply to its children.
/// </summary>
public enum TextFormatting
{
    /// <summary>Bold text.</summary>
    Bold = 0,

    /// <summary>Italic text.</summary>
    Italic = 1,

    /// <summary>Underlined text.</summary>
    Underline = 2,

    /// <summary>Struck-through text.</summary>
    Strikethrough = 3,

    /// <summary>Spoiler text.</summary>
    Spoiler = 4
}
|
||||
}
|
||||
19
DiscordChatExporter.Core.Markdown/TextNode.cs
Normal file
19
DiscordChatExporter.Core.Markdown/TextNode.cs
Normal file
@@ -0,0 +1,19 @@
|
||||
namespace DiscordChatExporter.Core.Markdown
|
||||
{
|
||||
/// <summary>
/// Markdown node that represents plain text.
/// </summary>
public class TextNode : Node
{
    /// <summary>
    /// Text carried by the node; it can differ from the lexeme when both are supplied separately.
    /// </summary>
    public string Text { get; }

    /// <summary>
    /// Creates a text node whose text differs from its source lexeme.
    /// </summary>
    /// <param name="lexeme">Source text this node was produced from.</param>
    /// <param name="text">Text carried by the node.</param>
    public TextNode(string lexeme, string text)
        : base(lexeme)
    {
        this.Text = text;
    }

    /// <summary>
    /// Creates a text node whose lexeme and text are the same string.
    /// </summary>
    /// <param name="text">Text carried by the node, also used as the lexeme.</param>
    public TextNode(string text)
        : this(text, text)
    {
    }

    /// <summary>
    /// Returns the text carried by the node.
    /// </summary>
    public override string ToString()
    {
        return Text;
    }
}
|
||||
}
|
||||
Reference in New Issue
Block a user