mirror of
https://github.com/Tyrrrz/DiscordChatExporter.git
synced 2026-04-28 08:46:44 +00:00
Rework markdown parser and improve its performance for non-HTML formats
This commit is contained in:
@@ -16,7 +16,7 @@ namespace DiscordChatExporter.Core.Markdown.Internal
|
||||
{
|
||||
}
|
||||
|
||||
public ParsedMatch<T> Match(string input, int startIndex, int length)
|
||||
public ParsedMatch<T> Match(StringPart stringPart)
|
||||
{
|
||||
ParsedMatch<T> earliestMatch = null;
|
||||
|
||||
@@ -24,19 +24,19 @@ namespace DiscordChatExporter.Core.Markdown.Internal
|
||||
foreach (var matcher in _matchers)
|
||||
{
|
||||
// Try to match
|
||||
var match = matcher.Match(input, startIndex, length);
|
||||
var match = matcher.Match(stringPart);
|
||||
|
||||
// If there's no match - continue
|
||||
if (match == null)
|
||||
continue;
|
||||
|
||||
// If this match is earlier than previous earliest - replace
|
||||
if (earliestMatch == null || match.StartIndex < earliestMatch.StartIndex)
|
||||
if (earliestMatch == null || match.StringPart.StartIndex < earliestMatch.StringPart.StartIndex)
|
||||
earliestMatch = match;
|
||||
|
||||
// If the earliest match starts at the very beginning - break,
|
||||
// because it's impossible to find a match earlier than that
|
||||
if (earliestMatch.StartIndex == startIndex)
|
||||
if (earliestMatch.StringPart.StartIndex == stringPart.StartIndex)
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,50 +1,54 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
namespace DiscordChatExporter.Core.Markdown.Internal
|
||||
{
|
||||
internal static class Extensions
|
||||
{
|
||||
public static IEnumerable<ParsedMatch<T>> MatchAll<T>(this IMatcher<T> matcher, string input,
|
||||
int startIndex, int length, Func<string, T> fallbackTransform)
|
||||
{
|
||||
// Get end index for simplicity
|
||||
var endIndex = startIndex + length;
|
||||
/// <summary>
/// Creates a new part over the same target string as <paramref name="stringPart"/>,
/// covering the range that starts at <paramref name="newStartIndex"/> and spans
/// <paramref name="newLength"/> characters.
/// </summary>
/// <param name="stringPart">Part whose target string is reused.</param>
/// <param name="newStartIndex">Start index of the new part, relative to the target string.</param>
/// <param name="newLength">Number of characters in the new part.</param>
/// <returns>A new <see cref="StringPart"/>; the given part is not modified.</returns>
public static StringPart Shrink(this StringPart stringPart, int newStartIndex, int newLength)
{
    return new StringPart(stringPart.Target, newStartIndex, newLength);
}
|
||||
|
||||
/// <summary>
/// Creates a new part over the same target string that starts at
/// <paramref name="newStartIndex"/> and runs up to the end index of <paramref name="stringPart"/>.
/// </summary>
/// <param name="stringPart">Part whose target string and end index are reused.</param>
/// <param name="newStartIndex">Start index of the new part, relative to the target string.</param>
/// <returns>A new <see cref="StringPart"/>; the given part is not modified.</returns>
public static StringPart Shrink(this StringPart stringPart, int newStartIndex)
{
    // Keep the end index fixed: whatever is left between the new start and the old end.
    var remainingLength = stringPart.EndIndex - newStartIndex;

    return stringPart.Shrink(newStartIndex, remainingLength);
}
|
||||
|
||||
/// <summary>
/// Creates a new part over the same target string, covering the range reported by
/// <paramref name="capture"/> (its <c>Index</c> and <c>Length</c>).
/// </summary>
/// <param name="stringPart">Part whose target string is reused.</param>
/// <param name="capture">Regex capture whose index and length define the new range.</param>
/// <returns>A new <see cref="StringPart"/>; the given part is not modified.</returns>
public static StringPart Shrink(this StringPart stringPart, Capture capture)
{
    return stringPart.Shrink(capture.Index, capture.Length);
}
|
||||
|
||||
public static IEnumerable<ParsedMatch<T>> MatchAll<T>(this IMatcher<T> matcher, StringPart stringPart,
|
||||
Func<StringPart, T> fallbackTransform)
|
||||
{
|
||||
// Loop through segments divided by individual matches
|
||||
var currentIndex = startIndex;
|
||||
while (currentIndex < endIndex)
|
||||
var currentIndex = stringPart.StartIndex;
|
||||
while (currentIndex < stringPart.EndIndex)
|
||||
{
|
||||
// Find a match within this segment
|
||||
var match = matcher.Match(input, currentIndex, endIndex - currentIndex);
|
||||
var match = matcher.Match(stringPart.Shrink(currentIndex, stringPart.EndIndex - currentIndex));
|
||||
|
||||
// If there's no match - break
|
||||
if (match == null)
|
||||
break;
|
||||
|
||||
// If this match doesn't start immediately at current index - transform and yield fallback first
|
||||
if (match.StartIndex > currentIndex)
|
||||
if (match.StringPart.StartIndex > currentIndex)
|
||||
{
|
||||
var fallback = input.Substring(currentIndex, match.StartIndex - currentIndex);
|
||||
yield return new ParsedMatch<T>(currentIndex, fallback.Length, fallbackTransform(fallback));
|
||||
var fallbackPart = stringPart.Shrink(currentIndex, match.StringPart.StartIndex - currentIndex);
|
||||
yield return new ParsedMatch<T>(fallbackPart, fallbackTransform(fallbackPart));
|
||||
}
|
||||
|
||||
// Yield match
|
||||
yield return match;
|
||||
|
||||
// Shift current index to the end of the match
|
||||
currentIndex = match.StartIndex + match.Length;
|
||||
currentIndex = match.StringPart.StartIndex + match.StringPart.Length;
|
||||
}
|
||||
|
||||
// If EOL wasn't reached - transform and yield remaining part as fallback
|
||||
if (currentIndex < endIndex)
|
||||
if (currentIndex < stringPart.EndIndex)
|
||||
{
|
||||
var fallback = input.Substring(currentIndex);
|
||||
yield return new ParsedMatch<T>(currentIndex, fallback.Length, fallbackTransform(fallback));
|
||||
var fallbackPart = stringPart.Shrink(currentIndex);
|
||||
yield return new ParsedMatch<T>(fallbackPart, fallbackTransform(fallbackPart));
|
||||
}
|
||||
}
|
||||
|
||||
public static IEnumerable<ParsedMatch<T>> MatchAll<T>(this IMatcher<T> matcher, string input,
|
||||
Func<string, T> fallbackTransform) => matcher.MatchAll(input, 0, input.Length, fallbackTransform);
|
||||
}
|
||||
}
|
||||
@@ -2,6 +2,6 @@
|
||||
{
|
||||
internal interface IMatcher<T>
|
||||
{
|
||||
ParsedMatch<T> Match(string input, int startIndex, int length);
|
||||
ParsedMatch<T> Match(StringPart stringPart);
|
||||
}
|
||||
}
|
||||
@@ -2,16 +2,13 @@
|
||||
{
|
||||
internal class ParsedMatch<T>
|
||||
{
|
||||
public int StartIndex { get; }
|
||||
|
||||
public int Length { get; }
|
||||
public StringPart StringPart { get; }
|
||||
|
||||
public T Value { get; }
|
||||
|
||||
public ParsedMatch(int startIndex, int length, T value)
|
||||
public ParsedMatch(StringPart stringPart, T value)
|
||||
{
|
||||
StartIndex = startIndex;
|
||||
Length = length;
|
||||
StringPart = stringPart;
|
||||
Value = value;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,8 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Linq.Expressions;
|
||||
using System.Reflection;
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
namespace DiscordChatExporter.Core.Markdown.Internal
|
||||
@@ -6,18 +10,35 @@ namespace DiscordChatExporter.Core.Markdown.Internal
|
||||
internal class RegexMatcher<T> : IMatcher<T>
|
||||
{
|
||||
private readonly Regex _regex;
|
||||
private readonly Func<Match, T> _transform;
|
||||
private readonly Func<StringPart, Match, T> _transform;
|
||||
|
||||
public RegexMatcher(Regex regex, Func<Match, T> transform)
|
||||
public RegexMatcher(Regex regex, Func<StringPart, Match, T> transform)
|
||||
{
|
||||
_regex = regex;
|
||||
_transform = transform;
|
||||
}
|
||||
|
||||
public ParsedMatch<T> Match(string input, int startIndex, int length)
|
||||
public RegexMatcher(Regex regex, Func<Match, T> transform)
|
||||
: this(regex, (p, m) => transform(m))
|
||||
{
|
||||
var match = _regex.Match(input, startIndex, length);
|
||||
return match.Success ? new ParsedMatch<T>(match.Index, match.Length, _transform(match)) : null;
|
||||
}
|
||||
|
||||
/// <summary>
/// Tries to find the first regex match inside the range described by <paramref name="stringPart"/>.
/// </summary>
/// <param name="stringPart">Range of the target string to search in.</param>
/// <returns>
/// A <see cref="ParsedMatch{T}"/> holding the matched range and its transformed value,
/// or <c>null</c> when either of the two match checks below fails.
/// </returns>
public ParsedMatch<T> Match(StringPart stringPart)
{
    var target = stringPart.Target;

    // First pass: search only within the requested range.
    // Regex.Match(string, int, int) evaluates the pattern as if the range were the entire input.
    var regexMatch = _regex.Match(target, stringPart.StartIndex, stringPart.Length);

    if (regexMatch.Success)
    {
        // Second pass: Regex.IsMatch(string, int) does take the text before the start index into
        // account, so ^/$ tokens are evaluated against the bigger part of the input (everything
        // up to the end of the range). The first-pass match is accepted only if the expression
        // also matches there.
        var widerInput = target.Substring(0, stringPart.EndIndex);

        if (_regex.IsMatch(widerInput, stringPart.StartIndex))
        {
            var matchedPart = stringPart.Shrink(regexMatch.Index, regexMatch.Length);
            return new ParsedMatch<T>(matchedPart, _transform(matchedPart, regexMatch));
        }
    }

    return null;
}
|
||||
}
|
||||
}
|
||||
@@ -6,24 +6,31 @@ namespace DiscordChatExporter.Core.Markdown.Internal
|
||||
{
|
||||
private readonly string _needle;
|
||||
private readonly StringComparison _comparison;
|
||||
private readonly Func<string, T> _transform;
|
||||
private readonly Func<StringPart, T> _transform;
|
||||
|
||||
public StringMatcher(string needle, StringComparison comparison, Func<string, T> transform)
|
||||
public StringMatcher(string needle, StringComparison comparison, Func<StringPart, T> transform)
|
||||
{
|
||||
_needle = needle;
|
||||
_comparison = comparison;
|
||||
_transform = transform;
|
||||
}
|
||||
|
||||
public StringMatcher(string needle, Func<string, T> transform)
|
||||
public StringMatcher(string needle, Func<StringPart, T> transform)
|
||||
: this(needle, StringComparison.Ordinal, transform)
|
||||
{
|
||||
}
|
||||
|
||||
public ParsedMatch<T> Match(string input, int startIndex, int length)
|
||||
public ParsedMatch<T> Match(StringPart stringPart)
|
||||
{
|
||||
var index = input.IndexOf(_needle, startIndex, length, _comparison);
|
||||
return index >= 0 ? new ParsedMatch<T>(index, _needle.Length, _transform(_needle)) : null;
|
||||
var index = stringPart.Target.IndexOf(_needle, stringPart.StartIndex, stringPart.Length, _comparison);
|
||||
|
||||
if (index >= 0)
|
||||
{
|
||||
var stringPartShrunk = stringPart.Shrink(index, _needle.Length);
|
||||
return new ParsedMatch<T>(stringPartShrunk, _transform(stringPartShrunk));
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
28
DiscordChatExporter.Core.Markdown/Internal/StringPart.cs
Normal file
28
DiscordChatExporter.Core.Markdown/Internal/StringPart.cs
Normal file
@@ -0,0 +1,28 @@
|
||||
namespace DiscordChatExporter.Core.Markdown.Internal
|
||||
{
|
||||
/// <summary>
/// Describes a contiguous range of characters inside a string by start index and length,
/// without copying the characters. The text is only materialized by <see cref="ToString"/>.
/// </summary>
internal class StringPart
{
    /// <summary>The full string that this part points into.</summary>
    public string Target { get; }

    /// <summary>Index in <see cref="Target"/> of the first character of this part.</summary>
    public int StartIndex { get; }

    /// <summary>Number of characters in this part.</summary>
    public int Length { get; }

    /// <summary>Index just past the last character of this part (<c>StartIndex + Length</c>).</summary>
    public int EndIndex { get; }

    /// <summary>
    /// Creates a part covering <paramref name="length"/> characters of <paramref name="target"/>
    /// starting at <paramref name="startIndex"/>.
    /// </summary>
    /// <param name="target">String the part points into.</param>
    /// <param name="startIndex">Index of the first character of the part.</param>
    /// <param name="length">Number of characters in the part.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="target"/> is null.</exception>
    /// <exception cref="System.ArgumentOutOfRangeException">
    /// The range described by <paramref name="startIndex"/> and <paramref name="length"/>
    /// does not lie within <paramref name="target"/>.
    /// </exception>
    public StringPart(string target, int startIndex, int length)
    {
        // Fail at construction time instead of much later inside ToString() or a matcher.
        if (target == null)
            throw new System.ArgumentNullException(nameof(target));

        if (startIndex < 0 || startIndex > target.Length)
            throw new System.ArgumentOutOfRangeException(nameof(startIndex));

        // Written as a subtraction so that startIndex + length can never overflow.
        if (length < 0 || length > target.Length - startIndex)
            throw new System.ArgumentOutOfRangeException(nameof(length));

        Target = target;
        StartIndex = startIndex;
        Length = length;
        EndIndex = startIndex + length;
    }

    /// <summary>
    /// Creates a part covering the whole of <paramref name="target"/>.
    /// </summary>
    /// <param name="target">String the part points into.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="target"/> is null.</exception>
    public StringPart(string target)
        // Null-safe length so that a null target reaches the guard in the main constructor
        // rather than failing with a NullReferenceException here.
        : this(target, 0, target?.Length ?? 0)
    {
    }

    /// <summary>Returns the characters of <see cref="Target"/> covered by this part.</summary>
    public override string ToString() => Target.Substring(StartIndex, Length);
}
|
||||
}
|
||||
Reference in New Issue
Block a user