Rework markdown parser and improve its performance for non-HTML formats

This commit is contained in:
Alexey Golub
2019-09-15 21:24:07 +03:00
parent 533671c59f
commit cd042e5368
20 changed files with 201 additions and 139 deletions

View File

@@ -16,7 +16,7 @@ namespace DiscordChatExporter.Core.Markdown.Internal
{
}
public ParsedMatch<T> Match(string input, int startIndex, int length)
public ParsedMatch<T> Match(StringPart stringPart)
{
ParsedMatch<T> earliestMatch = null;
@@ -24,19 +24,19 @@ namespace DiscordChatExporter.Core.Markdown.Internal
foreach (var matcher in _matchers)
{
// Try to match
var match = matcher.Match(input, startIndex, length);
var match = matcher.Match(stringPart);
// If there's no match - continue
if (match == null)
continue;
// If this match is earlier than previous earliest - replace
if (earliestMatch == null || match.StartIndex < earliestMatch.StartIndex)
if (earliestMatch == null || match.StringPart.StartIndex < earliestMatch.StringPart.StartIndex)
earliestMatch = match;
// If the earliest match starts at the very beginning - break,
// because it's impossible to find a match earlier than that
if (earliestMatch.StartIndex == startIndex)
if (earliestMatch.StringPart.StartIndex == stringPart.StartIndex)
break;
}

View File

@@ -1,50 +1,54 @@
using System;
using System.Collections.Generic;
using System.Text.RegularExpressions;
namespace DiscordChatExporter.Core.Markdown.Internal
{
// Extension helpers for narrowing a StringPart and for splitting input into matched and unmatched segments.
internal static class Extensions
{
// NOTE(review): this listing interleaves pre-change and post-change lines of the commit.
// The string/startIndex/length signature and the endIndex local below appear to be the pre-change form of MatchAll — confirm against the commit.
public static IEnumerable<ParsedMatch<T>> MatchAll<T>(this IMatcher<T> matcher, string input,
int startIndex, int length, Func<string, T> fallbackTransform)
{
// Get end index for simplicity
var endIndex = startIndex + length;
// Returns a new StringPart over the same Target with the given start index and length.
// NOTE(review): the new bounds are not checked against the current part, so despite the name the result can be wider — confirm callers only narrow.
public static StringPart Shrink(this StringPart stringPart, int newStartIndex, int newLength) =>
new StringPart(stringPart.Target, newStartIndex, newLength);
// Returns a new StringPart that starts at newStartIndex and ends at the current EndIndex.
public static StringPart Shrink(this StringPart stringPart, int newStartIndex) =>
stringPart.Shrink(newStartIndex, stringPart.EndIndex - newStartIndex);
// Returns a new StringPart covering capture.Index/capture.Length.
// Assumes the capture was produced by matching against stringPart.Target — TODO confirm at call sites.
public static StringPart Shrink(this StringPart stringPart, Capture capture) =>
stringPart.Shrink(capture.Index, capture.Length);
// Iterator that walks stringPart from StartIndex to EndIndex, yielding each match found by matcher in order.
// Text between matches, and any text left after the last match, is yielded as a ParsedMatch whose value comes from fallbackTransform.
public static IEnumerable<ParsedMatch<T>> MatchAll<T>(this IMatcher<T> matcher, StringPart stringPart,
Func<StringPart, T> fallbackTransform)
{
// Loop through segments divided by individual matches
var currentIndex = startIndex;
while (currentIndex < endIndex)
var currentIndex = stringPart.StartIndex;
while (currentIndex < stringPart.EndIndex)
{
// Find a match within this segment
var match = matcher.Match(input, currentIndex, endIndex - currentIndex);
var match = matcher.Match(stringPart.Shrink(currentIndex, stringPart.EndIndex - currentIndex));
// If there's no match - break
if (match == null)
break;
// If this match doesn't start immediately at current index - transform and yield fallback first
if (match.StartIndex > currentIndex)
if (match.StringPart.StartIndex > currentIndex)
{
var fallback = input.Substring(currentIndex, match.StartIndex - currentIndex);
yield return new ParsedMatch<T>(currentIndex, fallback.Length, fallbackTransform(fallback));
var fallbackPart = stringPart.Shrink(currentIndex, match.StringPart.StartIndex - currentIndex);
yield return new ParsedMatch<T>(fallbackPart, fallbackTransform(fallbackPart));
}
// Yield match
yield return match;
// Shift current index to the end of the match
// NOTE(review): a zero-length match that starts at currentIndex leaves currentIndex unchanged, so the loop would never terminate — confirm matchers never return empty matches.
currentIndex = match.StartIndex + match.Length;
currentIndex = match.StringPart.StartIndex + match.StringPart.Length;
}
// If EOL wasn't reached - transform and yield remaining part as fallback
if (currentIndex < endIndex)
if (currentIndex < stringPart.EndIndex)
{
var fallback = input.Substring(currentIndex);
yield return new ParsedMatch<T>(currentIndex, fallback.Length, fallbackTransform(fallback));
var fallbackPart = stringPart.Shrink(currentIndex);
yield return new ParsedMatch<T>(fallbackPart, fallbackTransform(fallbackPart));
}
}
// Overload that runs MatchAll over the whole input string (start 0, length input.Length).
public static IEnumerable<ParsedMatch<T>> MatchAll<T>(this IMatcher<T> matcher, string input,
Func<string, T> fallbackTransform) => matcher.MatchAll(input, 0, input.Length, fallbackTransform);
}
}

View File

@@ -2,6 +2,6 @@
{
// Contract for a component that searches a range of text and reports what it found as a ParsedMatch<T>.
// Implementations and callers visible in this listing use null to mean "no match".
internal interface IMatcher<T>
{
// Searches the range of input described by startIndex and length.
ParsedMatch<T> Match(string input, int startIndex, int length);
// Searches the range described by stringPart (its Target, StartIndex and Length).
ParsedMatch<T> Match(StringPart stringPart);
}
}

View File

@@ -2,16 +2,13 @@
{
// Immutable pair of a matched location and the value produced for it.
internal class ParsedMatch<T>
{
// Location expressed as a start index and length.
public int StartIndex { get; }
public int Length { get; }
// Location expressed as a StringPart (source string plus range).
public StringPart StringPart { get; }
// Value produced for the matched text.
public T Value { get; }
// Stores the given location and value; no validation is performed.
public ParsedMatch(int startIndex, int length, T value)
public ParsedMatch(StringPart stringPart, T value)
{
StartIndex = startIndex;
Length = length;
StringPart = stringPart;
Value = value;
}
}

View File

@@ -1,4 +1,8 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Linq.Expressions;
using System.Reflection;
using System.Text.RegularExpressions;
namespace DiscordChatExporter.Core.Markdown.Internal
@@ -6,18 +10,35 @@ namespace DiscordChatExporter.Core.Markdown.Internal
// Matcher that locates a match with a regular expression and converts it to T through a transform callback.
internal class RegexMatcher<T> : IMatcher<T>
{
private readonly Regex _regex;
private readonly Func<Match, T> _transform;
private readonly Func<StringPart, Match, T> _transform;
// Stores the regex and the transform; the two-argument transform is called with the matched StringPart and the regex Match.
public RegexMatcher(Regex regex, Func<Match, T> transform)
public RegexMatcher(Regex regex, Func<StringPart, Match, T> transform)
{
_regex = regex;
_transform = transform;
}
public ParsedMatch<T> Match(string input, int startIndex, int length)
// Convenience constructor for transforms that only need the regex Match; the StringPart argument is ignored.
public RegexMatcher(Regex regex, Func<Match, T> transform)
: this(regex, (p, m) => transform(m))
{
var match = _regex.Match(input, startIndex, length);
return match.Success ? new ParsedMatch<T>(match.Index, match.Length, _transform(match)) : null;
}
// Runs the regex over the range described by stringPart.
// Returns null when nothing matches, otherwise a ParsedMatch whose StringPart covers match.Index/match.Length.
public ParsedMatch<T> Match(StringPart stringPart)
{
var match = _regex.Match(stringPart.Target, stringPart.StartIndex, stringPart.Length);
if (!match.Success)
return null;
// Overload regex.Match(string, int, int) doesn't take the whole string into account,
// it effectively functions as a match check on a substring.
// Which is super weird because regex.Match(string, int) takes the whole input in context.
// So in order to properly account for ^/$ regex tokens, we need to make sure that
// the expression also matches on the bigger part of the input.
// NOTE(review): this check only establishes that some match exists at or after StartIndex within Target[0..EndIndex);
// it does not verify that it is the same match as `match` above, and Substring creates a new string on every call — confirm both are acceptable.
if (!_regex.IsMatch(stringPart.Target.Substring(0, stringPart.EndIndex), stringPart.StartIndex))
return null;
var stringPartShrunk = stringPart.Shrink(match.Index, match.Length);
return new ParsedMatch<T>(stringPartShrunk, _transform(stringPartShrunk, match));
}
}
}

View File

@@ -6,24 +6,31 @@ namespace DiscordChatExporter.Core.Markdown.Internal
{
// Text to search for.
private readonly string _needle;
// Comparison rule passed to IndexOf when searching.
private readonly StringComparison _comparison;
private readonly Func<string, T> _transform;
// Produces the value for a match from the matched StringPart.
private readonly Func<StringPart, T> _transform;
// Stores the needle, the comparison rule and the transform; no validation is performed.
public StringMatcher(string needle, StringComparison comparison, Func<string, T> transform)
public StringMatcher(string needle, StringComparison comparison, Func<StringPart, T> transform)
{
_needle = needle;
_comparison = comparison;
_transform = transform;
}
// Convenience constructor that uses StringComparison.Ordinal as the comparison rule.
public StringMatcher(string needle, Func<string, T> transform)
public StringMatcher(string needle, Func<StringPart, T> transform)
: this(needle, StringComparison.Ordinal, transform)
{
}
// Searches for the first occurrence of the needle inside the given range.
// Returns null when IndexOf reports no occurrence, otherwise a ParsedMatch located at the found index with the needle's length.
public ParsedMatch<T> Match(string input, int startIndex, int length)
public ParsedMatch<T> Match(StringPart stringPart)
{
var index = input.IndexOf(_needle, startIndex, length, _comparison);
return index >= 0 ? new ParsedMatch<T>(index, _needle.Length, _transform(_needle)) : null;
// NOTE(review): an empty needle would presumably be reported at StartIndex with length 0 — confirm needles are never empty.
var index = stringPart.Target.IndexOf(_needle, stringPart.StartIndex, stringPart.Length, _comparison);
if (index >= 0)
{
// The transform receives the matched range of the input, not the needle itself.
var stringPartShrunk = stringPart.Shrink(index, _needle.Length);
return new ParsedMatch<T>(stringPartShrunk, _transform(stringPartShrunk));
}
return null;
}
}
}

View File

@@ -0,0 +1,28 @@
namespace DiscordChatExporter.Core.Markdown.Internal
{
    /// <summary>
    /// Describes a contiguous range of characters inside a string by keeping the
    /// source string together with a start index and a length.
    /// </summary>
    internal class StringPart
    {
        /// <summary>String that the range refers into.</summary>
        public string Target { get; }

        /// <summary>Index in <see cref="Target"/> of the first character of the range.</summary>
        public int StartIndex { get; }

        /// <summary>Number of characters in the range.</summary>
        public int Length { get; }

        /// <summary>Index just past the last character of the range (start index plus length).</summary>
        public int EndIndex => StartIndex + Length;

        /// <summary>
        /// Creates a part of <paramref name="target"/> with the given bounds. The bounds are stored as given.
        /// </summary>
        public StringPart(string target, int startIndex, int length)
        {
            Length = length;
            StartIndex = startIndex;
            Target = target;
        }

        /// <summary>
        /// Creates a part that spans the whole of <paramref name="target"/>.
        /// </summary>
        public StringPart(string target)
            : this(target, 0, target.Length)
        {
        }

        /// <summary>
        /// Materializes the range as a new string.
        /// </summary>
        public override string ToString()
        {
            return Target.Substring(StartIndex, Length);
        }
    }
}