using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
using DiscordChatExporter.Domain.Markdown.Ast;
using DiscordChatExporter.Domain.Markdown.Matching;

namespace DiscordChatExporter.Domain.Markdown
{
    // The following parsing logic is meant to replicate Discord's markdown grammar as closely as possible.
    //
    // NOTE(review): the generic type argument <MarkdownNode> used throughout this file was reconstructed
    // (it was missing from the reviewed text, where IReadOnlyList/IMatcher appeared without type arguments).
    // Confirm it matches the base node type declared in DiscordChatExporter.Domain.Markdown.Ast.
    internal static partial class MarkdownParser
    {
        // Options shared by every regex below; individual matchers add Singleline where
        // their content is allowed to span multiple lines.
        private const RegexOptions DefaultRegexOptions =
            RegexOptions.Compiled |
            RegexOptions.CultureInvariant |
            RegexOptions.Multiline;

        /* Formatting */

        // Capture any character until the earliest double asterisk not followed by an asterisk
        private static readonly IMatcher<MarkdownNode> BoldFormattedNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("\\*\\*(.+?)\\*\\*(?!\\*)", DefaultRegexOptions | RegexOptions.Singleline),
            (p, m) => new FormattedNode(TextFormatting.Bold, Parse(p.Slice(m.Groups[1]))));

        // Capture any character until the earliest single asterisk not preceded or followed by an asterisk
        // Opening asterisk must not be followed by whitespace
        // Closing asterisk must not be preceded by whitespace
        // NOTE(review): the tail of this pattern (from the negative lookbehind onwards) was reconstructed
        // from the rules stated above; verify against the project's history.
        private static readonly IMatcher<MarkdownNode> ItalicFormattedNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("\\*(?!\\s)(.+?)(?<!\\s|\\*)\\*(?!\\*)", DefaultRegexOptions | RegexOptions.Singleline),
            (p, m) => new FormattedNode(TextFormatting.Italic, Parse(p.Slice(m.Groups[1]))));

        // Capture any character until the earliest triple asterisk not followed by an asterisk
        // The captured group keeps its inner double asterisks, so it is re-parsed with the bold matcher only
        private static readonly IMatcher<MarkdownNode> ItalicBoldFormattedNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("\\*(\\*\\*.+?\\*\\*)\\*(?!\\*)", DefaultRegexOptions | RegexOptions.Singleline),
            (p, m) => new FormattedNode(TextFormatting.Italic, Parse(p.Slice(m.Groups[1]), BoldFormattedNodeMatcher)));

        // Capture any character except underscore until an underscore
        // Closing underscore must not be followed by a word character
        private static readonly IMatcher<MarkdownNode> ItalicAltFormattedNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("_([^_]+)_(?!\\w)", DefaultRegexOptions | RegexOptions.Singleline),
            (p, m) => new FormattedNode(TextFormatting.Italic, Parse(p.Slice(m.Groups[1]))));

        // Capture any character until the earliest double underscore not followed by an underscore
        private static readonly IMatcher<MarkdownNode> UnderlineFormattedNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("__(.+?)__(?!_)", DefaultRegexOptions | RegexOptions.Singleline),
            (p, m) => new FormattedNode(TextFormatting.Underline, Parse(p.Slice(m.Groups[1]))));

        // Capture any character until the earliest triple underscore not followed by an underscore
        // The captured group keeps its inner double underscores, so it is re-parsed with the underline matcher only
        private static readonly IMatcher<MarkdownNode> ItalicUnderlineFormattedNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("_(__.+?__)_(?!_)", DefaultRegexOptions | RegexOptions.Singleline),
            (p, m) => new FormattedNode(TextFormatting.Italic, Parse(p.Slice(m.Groups[1]), UnderlineFormattedNodeMatcher)));

        // Capture any character until the earliest double tilde
        private static readonly IMatcher<MarkdownNode> StrikethroughFormattedNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("~~(.+?)~~", DefaultRegexOptions | RegexOptions.Singleline),
            (p, m) => new FormattedNode(TextFormatting.Strikethrough, Parse(p.Slice(m.Groups[1]))));

        // Capture any character until the earliest double pipe
        private static readonly IMatcher<MarkdownNode> SpoilerFormattedNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("\\|\\|(.+?)\\|\\|", DefaultRegexOptions | RegexOptions.Singleline),
            (p, m) => new FormattedNode(TextFormatting.Spoiler, Parse(p.Slice(m.Groups[1]))));

        // Capture any character until the end of the line
        // Opening 'greater than' character must be followed by whitespace
        private static readonly IMatcher<MarkdownNode> SingleLineQuoteNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("^>\\s(.+\n?)", DefaultRegexOptions),
            (p, m) => new FormattedNode(TextFormatting.Quote, Parse(p.Slice(m.Groups[1]))));

        // Repeatedly capture any character until the end of the line
        // This one is tricky as it ends up producing multiple separate captures which need to be joined
        private static readonly IMatcher<MarkdownNode> RepeatedSingleLineQuoteNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("(?:^>\\s(.+\n?)){2,}", DefaultRegexOptions),
            (p, m) =>
            {
                // Group 1 is captured once per quoted line; stitch the lines back together
                // and parse the result as a fresh string.
                var content = string.Concat(m.Groups[1].Captures.Select(c => c.Value));
                return new FormattedNode(TextFormatting.Quote, Parse(content));
            });

        // Capture any character until the end of the input
        // Opening 'greater than' characters must be followed by whitespace
        private static readonly IMatcher<MarkdownNode> MultiLineQuoteNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("^>>>\\s(.+)", DefaultRegexOptions | RegexOptions.Singleline),
            (p, m) => new FormattedNode(TextFormatting.Quote, Parse(p.Slice(m.Groups[1]))));

        /* Code blocks */

        // Capture any character except backtick until a backtick
        // Blank lines at the beginning and end of content are trimmed
        // There can be either one or two backticks, but equal number on both sides
        private static readonly IMatcher<MarkdownNode> InlineCodeBlockNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("(`{1,2})([^`]+)\\1", DefaultRegexOptions | RegexOptions.Singleline),
            m => new InlineCodeBlockNode(m.Groups[2].Value.Trim('\r', '\n')));

        // Capture language identifier and then any character until the earliest triple backtick
        // Language identifier is one word immediately after opening backticks, followed immediately by newline
        // Blank lines at the beginning and end of content are trimmed
        private static readonly IMatcher<MarkdownNode> MultiLineCodeBlockNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("```(?:(\\w*)\\n)?(.+?)```", DefaultRegexOptions | RegexOptions.Singleline),
            m => new MultiLineCodeBlockNode(m.Groups[1].Value, m.Groups[2].Value.Trim('\r', '\n')));

        /* Mentions */

        // Capture @everyone
        private static readonly IMatcher<MarkdownNode> EveryoneMentionNodeMatcher = new StringMatcher<MarkdownNode>(
            "@everyone",
            p => new MentionNode("everyone", MentionType.Meta));

        // Capture @here
        private static readonly IMatcher<MarkdownNode> HereMentionNodeMatcher = new StringMatcher<MarkdownNode>(
            "@here",
            p => new MentionNode("here", MentionType.Meta));

        // Capture <@123456> or <@!123456>
        private static readonly IMatcher<MarkdownNode> UserMentionNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("<@!?(\\d+)>", DefaultRegexOptions),
            m => new MentionNode(m.Groups[1].Value, MentionType.User));

        // Capture <#123456>
        private static readonly IMatcher<MarkdownNode> ChannelMentionNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("<#(\\d+)>", DefaultRegexOptions),
            m => new MentionNode(m.Groups[1].Value, MentionType.Channel));

        // Capture <@&123456>
        private static readonly IMatcher<MarkdownNode> RoleMentionNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("<@&(\\d+)>", DefaultRegexOptions),
            m => new MentionNode(m.Groups[1].Value, MentionType.Role));

        /* Emojis */

        // Capture any country flag emoji (two regional indicator surrogate pairs)
        // ... or "miscellaneous symbol" character
        // ... or surrogate pair
        // ... or digit followed by enclosing mark
        // (this does not match all emojis in Discord but it's reasonably accurate enough)
        private static readonly IMatcher<MarkdownNode> StandardEmojiNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("((?:[\\uD83C][\\uDDE6-\\uDDFF]){2}|[\\u2600-\\u26FF]|\\p{Cs}{2}|\\d\\p{Me})", DefaultRegexOptions),
            m => new EmojiNode(m.Groups[1].Value));

        // Capture <:lul:123456> or <a:lul:123456>
        // Groups: 1 = optional "a" prefix, 2 = emoji name, 3 = numeric ID
        private static readonly IMatcher<MarkdownNode> CustomEmojiNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("<(a)?:(.+?):(\\d+?)>", DefaultRegexOptions),
            m => new EmojiNode(m.Groups[3].Value, m.Groups[2].Value, !string.IsNullOrWhiteSpace(m.Groups[1].Value)));

        /* Links */

        // Capture [title](link)
        private static readonly IMatcher<MarkdownNode> TitledLinkNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("\\[(.+?)\\]\\((.+?)\\)", DefaultRegexOptions),
            m => new LinkNode(m.Groups[2].Value, m.Groups[1].Value));

        // Capture any non-whitespace character after http:// or https:// until the last punctuation character or whitespace
        private static readonly IMatcher<MarkdownNode> AutoLinkNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("(https?://\\S*[^\\.,:;\"\'\\s])", DefaultRegexOptions),
            m => new LinkNode(m.Groups[1].Value));

        // Same as auto link but also surrounded by angular brackets
        private static readonly IMatcher<MarkdownNode> HiddenLinkNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("<(https?://\\S*[^\\.,:;\"\'\\s])>", DefaultRegexOptions),
            m => new LinkNode(m.Groups[1].Value));

        /* Text */

        // Capture the shrug emoticon
        // This escapes it from matching for formatting
        private static readonly IMatcher<MarkdownNode> ShrugTextNodeMatcher = new StringMatcher<MarkdownNode>(
            @"¯\_(ツ)_/¯",
            p => new TextNode(p.ToString()));

        // Capture some specific emojis that don't get rendered
        // This escapes it from matching for emoji
        private static readonly IMatcher<MarkdownNode> IgnoredEmojiTextNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("(\\u26A7|\\u2640|\\u2642|\\u2695|\\u267E|\\u00A9|\\u00AE|\\u2122)", DefaultRegexOptions),
            m => new TextNode(m.Groups[1].Value));

        // Capture any "symbol/other" character or surrogate pair preceded by a backslash
        // This escapes it from matching for emoji
        private static readonly IMatcher<MarkdownNode> EscapedSymbolTextNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("\\\\(\\p{So}|\\p{Cs}{2})", DefaultRegexOptions),
            m => new TextNode(m.Groups[1].Value));

        // Capture any non-whitespace, non latin alphanumeric character preceded by a backslash
        // This escapes it from matching for formatting or other tokens
        private static readonly IMatcher<MarkdownNode> EscapedCharacterTextNodeMatcher = new RegexMatcher<MarkdownNode>(
            new Regex("\\\\([^a-zA-Z0-9\\s])", DefaultRegexOptions),
            m => new TextNode(m.Groups[1].Value));

        // Combine all matchers into one
        // Matchers that have similar patterns are ordered from most specific to least specific
        private static readonly IMatcher<MarkdownNode> AggregateNodeMatcher = new AggregateMatcher<MarkdownNode>(
            // Escaped text
            ShrugTextNodeMatcher,
            IgnoredEmojiTextNodeMatcher,
            EscapedSymbolTextNodeMatcher,
            EscapedCharacterTextNodeMatcher,

            // Formatting
            ItalicBoldFormattedNodeMatcher,
            ItalicUnderlineFormattedNodeMatcher,
            BoldFormattedNodeMatcher,
            ItalicFormattedNodeMatcher,
            UnderlineFormattedNodeMatcher,
            ItalicAltFormattedNodeMatcher,
            StrikethroughFormattedNodeMatcher,
            SpoilerFormattedNodeMatcher,
            MultiLineQuoteNodeMatcher,
            RepeatedSingleLineQuoteNodeMatcher,
            SingleLineQuoteNodeMatcher,

            // Code blocks
            MultiLineCodeBlockNodeMatcher,
            InlineCodeBlockNodeMatcher,

            // Mentions
            EveryoneMentionNodeMatcher,
            HereMentionNodeMatcher,
            UserMentionNodeMatcher,
            ChannelMentionNodeMatcher,
            RoleMentionNodeMatcher,

            // Links
            TitledLinkNodeMatcher,
            AutoLinkNodeMatcher,
            HiddenLinkNodeMatcher,

            // Emoji
            StandardEmojiNodeMatcher,
            CustomEmojiNodeMatcher
        );

        // Minimal set of matchers for non-multimedia formats (e.g. plain text)
        private static readonly IMatcher<MarkdownNode> MinimalAggregateNodeMatcher = new AggregateMatcher<MarkdownNode>(
            // Mentions
            EveryoneMentionNodeMatcher,
            HereMentionNodeMatcher,
            UserMentionNodeMatcher,
            ChannelMentionNodeMatcher,
            RoleMentionNodeMatcher,

            // Emoji
            CustomEmojiNodeMatcher
        );

        // Runs the given matcher over the whole string part and collects the resulting nodes.
        // Any stretch of input the matcher does not claim is turned into a plain TextNode
        // by the fallback transform.
        private static IReadOnlyList<MarkdownNode> Parse(StringPart stringPart, IMatcher<MarkdownNode> matcher) =>
            matcher
                .MatchAll(stringPart, p => new TextNode(p.ToString()))
                .Select(r => r.Value)
                .ToArray();
    }

    internal static partial class MarkdownParser
    {
        // Parses a string part using the full set of matchers
        private static IReadOnlyList<MarkdownNode> Parse(StringPart stringPart) =>
            Parse(stringPart, AggregateNodeMatcher);

        // Parses a string part using only the mention and custom emoji matchers
        private static IReadOnlyList<MarkdownNode> ParseMinimal(StringPart stringPart) =>
            Parse(stringPart, MinimalAggregateNodeMatcher);

        /// <summary>
        /// Parses the input into markdown nodes using the full set of matchers
        /// (escapes, formatting, code blocks, mentions, links and emoji).
        /// </summary>
        public static IReadOnlyList<MarkdownNode> Parse(string input) =>
            Parse(new StringPart(input));

        /// <summary>
        /// Parses the input into markdown nodes using the minimal set of matchers
        /// (mentions and custom emoji only); everything else is kept as text.
        /// </summary>
        public static IReadOnlyList<MarkdownNode> ParseMinimal(string input) =>
            ParseMinimal(new StringPart(input));
    }
}