Skip to content

Commit

Permalink
fast CSV
Browse files Browse the repository at this point in the history
  • Loading branch information
olmobrutall committed Oct 28, 2024
1 parent 9f74924 commit e17aeb8
Showing 1 changed file with 113 additions and 95 deletions.
208 changes: 113 additions & 95 deletions Signum.Utilities/Csv.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
using System.Collections.Concurrent;
using System.Collections;
using System.IO.Pipes;
using System;
using System.ComponentModel.Design.Serialization;

namespace Signum.Utilities;

Expand Down Expand Up @@ -181,8 +183,7 @@ public static IEnumerable<T> ReadStream<T>(Stream stream, Encoding? encoding = n

var members = CsvMemberCache<T>.Members;
var parsers = members.Select(m => GetParser(defCulture, m, defOptions.ParserFactory)).ToList();

Regex regex = GetRegex(defCulture, defOptions.RegexTimeout, defOptions.ListSeparator);
Regex valueRegex = GetRegex(isLine: false, defCulture, defOptions.RegexTimeout, defOptions.ListSeparator);

if (defOptions.AsumeSingleLine)
{
Expand All @@ -199,66 +200,68 @@ public static IEnumerable<T> ReadStream<T>(Stream stream, Encoding? encoding = n
if (csvLine == null)
yield break;

Match? m = null;
T? t = null;
try
if (csvLine.Length > 0)
{
m = regex.Match(csvLine);
if (m.Length > 0)
T? t = null;
try
{
t = ReadObject<T>(m, members, parsers);
var m = valueRegex.EnumerateMatches(csvLine);

t = ReadObject<T>(m, csvLine.AsSpan(), members, parsers);
}
}
catch (Exception e)
{
e.Data["row"] = line;
catch (Exception e)
{
e.Data["row"] = line;

if (defOptions.SkipError == null || !defOptions.SkipError(e, m))
throw new ParseCsvException(e);
}
if (defOptions.SkipError == null || !defOptions.SkipError(e, csvLine))
throw new ParseCsvException(e);
}

if (t != null)
yield return t;
if (t != null)
yield return t;

}
line++;
}
}
}
else
{
Regex lineRegex = GetRegex(isLine: true, defCulture, defOptions.RegexTimeout, defOptions.ListSeparator);

using (StreamReader sr = new StreamReader(stream, encoding))
{
string str = sr.ReadToEnd();

var matches = regex.Matches(str).Cast<Match>();

if (skipLines > 0)
matches = matches.Skip(skipLines);

int line = skipLines;
foreach (var m in matches)
int i = 0;
foreach (Match m in lineRegex.Matches(str))
{
if (i < skipLines)
continue;

if (m.Length > 0)
{
T? t = null;
try
{
var line = m.Value;

if (options?.Constructor != null)
t = options.Constructor(m);
t = options.Constructor(line);
else
t = ReadObject<T>(m, members, parsers);
t = ReadObject<T>(valueRegex.EnumerateMatches(line), line, members, parsers);
}
catch (Exception e)
{
e.Data["row"] = line;
e.Data["row"] = i;

if (defOptions.SkipError == null || !defOptions.SkipError(e, m))
if (defOptions.SkipError == null || !defOptions.SkipError(e, str.Substring(m.Index, m.Length)))
throw new ParseCsvException(e);
}
if (t != null)
yield return t;
}
line++;
i++;
}
}
}
Expand All @@ -271,18 +274,20 @@ public static T ReadLine<T>(string csvLine, CultureInfo? culture = null, CsvRead

var defCulture = GetDefaultCulture(culture);

Regex regex = GetRegex(defCulture, defOptions.RegexTimeout);
Regex regex = GetRegex(isLine: false, defCulture, defOptions.RegexTimeout);

Match m = regex.Match(csvLine);
var vme = regex.EnumerateMatches(csvLine);

var members = CsvMemberCache<T>.Members;

return ReadObject<T>(m,
return ReadObject<T>(vme,
csvLine.AsSpan(),
members,
members.Select(c => GetParser(defCulture, c, defOptions.ParserFactory)).ToList());
}

private static Func<string, object?> GetParser<T>(CultureInfo culture, CsvMemberInfo<T> column, Func<CsvMemberInfo<T>, CultureInfo, Func<string, object?>?>? parserFactory)

private static ValueParser GetParser<T>(CultureInfo culture, CsvMemberInfo<T> column, Func<CsvMemberInfo<T>, CultureInfo, ValueParser?>? parserFactory)
{
if (parserFactory != null)
{
Expand All @@ -294,43 +299,51 @@ public static T ReadLine<T>(string csvLine, CultureInfo? culture = null, CsvRead

var type = column.IsCollection ? column.MemberInfo.ReturningType().ElementType()! : column.MemberInfo.ReturningType();

return str => ConvertTo(str, type, culture, column.Format);
return GetBasicParser(type.UnNullify(), culture, column.Format);
}

static T ReadObject<T>(Match m, List<CsvMemberInfo<T>> members, List<Func<string, object?>> parsers)
{
var vals = m.Groups["val"].Captures;

if (vals.Count < members.Count)
throw new FormatException("Only {0} columns found (instead of {1}) in line: {2}".FormatWith(vals.Count, members.Count, m.Value));
public delegate object? ValueParser(ReadOnlySpan<char> str);

static T ReadObject<T>(Regex.ValueMatchEnumerator vme, ReadOnlySpan<char> line, List<CsvMemberInfo<T>> members, List<ValueParser> parsers)
{
T t = Activator.CreateInstance<T>();

for (int i = 0; i < members.Count; i++)
bool endsInCollection = false;
int i = 0;
foreach (var v in vme)
{
if (members.Count <= i)
continue;

var value = line.Slice(v.Index, v.Length);
var member = members[i];
var parser = parsers[i];
string? str = null;
try
{
if (!member.IsCollection)
{
str = DecodeCsv(vals[i].Value);
value = DecodeCsv(value);

object? val = parser(str);
object? val = parser(value);

member.MemberEntry.Setter!(t, val);
}
else
{
if (i != members.Count - 1)
throw new InvalidOperationException($"Collection {member.MemberInfo} should be the last member");
endsInCollection = true;
var list = (IList)Activator.CreateInstance(member.MemberInfo.ReturningType())!;

for (int j = i; j < vals.Count; j++)
{
str = DecodeCsv(vals[j].Value);

object? val = parser(str);
value = DecodeCsv(value);
object? val = parser(value);
list.Add(val);

foreach (var v2 in vme)
{
value = line.Slice(v2.Index, v2.Length);
value = DecodeCsv(value);
val = parser(value);
list.Add(val);
}

Expand All @@ -339,11 +352,17 @@ static T ReadObject<T>(Match m, List<CsvMemberInfo<T>> members, List<Func<string
}
catch (Exception e)
{
e.Data["value"] = str;
e.Data["value"] = new String(value);
e.Data["member"] = members[i].MemberInfo.Name;
throw;
}

i++;
}

if (!endsInCollection && i != members.Count)
throw new FormatException("Only {0} columns found (instead of {1}) in line: {2}".FormatWith(i, members.Count, new string(line)));

return t;
}

Expand All @@ -369,7 +388,7 @@ public static IEnumerable<string[]> ReadUntypedStream(Stream stream, Encoding? e
var defCulture = GetDefaultCulture(culture);
var defOptions = options ?? new CsvReadOptions();

Regex regex = GetRegex(defCulture, defOptions.RegexTimeout, defOptions.ListSeparator);
Regex valueRegex = GetRegex(false, defCulture, defOptions.RegexTimeout, defOptions.ListSeparator);
if (defOptions.AsumeSingleLine)
{
using (StreamReader sr = new StreamReader(stream, encoding))
Expand All @@ -386,7 +405,7 @@ public static IEnumerable<string[]> ReadUntypedStream(Stream stream, Encoding? e
string[]? t = null;
try
{
m = regex.Match(csvLine);
m = valueRegex.Match(csvLine);
if (m.Length > 0)
{
t = m.Groups["val"].Captures.Select(c => c.Value).ToArray();
Expand All @@ -396,7 +415,7 @@ public static IEnumerable<string[]> ReadUntypedStream(Stream stream, Encoding? e
{
e.Data["row"] = line;

if (defOptions.SkipError == null || !defOptions.SkipError(e, m))
if (defOptions.SkipError == null || !defOptions.SkipError(e, csvLine))
throw new ParseCsvException(e);
}

Expand All @@ -413,7 +432,7 @@ public static IEnumerable<string[]> ReadUntypedStream(Stream stream, Encoding? e
{
string str = sr.ReadToEnd();

var matches = regex.Matches(str).Cast<Match>();
var matches = valueRegex.Matches(str).Cast<Match>();

int line = 0;
foreach (var m in matches)
Expand All @@ -429,7 +448,7 @@ public static IEnumerable<string[]> ReadUntypedStream(Stream stream, Encoding? e
{
e.Data["row"] = line;

if (defOptions.SkipError == null || !defOptions.SkipError(e, m))
if (defOptions.SkipError == null || !defOptions.SkipError(e, m.Value))
throw new ParseCsvException(e);
}
if (t != null)
Expand Down Expand Up @@ -530,16 +549,17 @@ public class MyFileCSV
""";
}


static ConcurrentDictionary<char, Regex> regexCache = new ConcurrentDictionary<char, Regex>();
const string BaseRegex = @"^((?<val>'(?:[^']+|'')*'|[^;\r\n]*))?((?!($|\r\n));(?<val>'(?:[^']+|'')*'|[^;\r\n]*))*($|\r\n)";
static Regex GetRegex(CultureInfo culture, TimeSpan timeout, char? listSeparator = null)
static ConcurrentDictionary<(bool multiLine, char separator, TimeSpan timeout), Regex> regexCache = new();
readonly static string ValueRegex = "'(?:[^']+|'')*'|[^;\r\n]*".Replace('\'', '"');
readonly static string LineRegex = $@"^({ValueRegex})?((?!($|\r\n));({ValueRegex}))*($|\r\n)";
static Regex GetRegex(bool isLine, CultureInfo culture, TimeSpan timeout, char? listSeparator = null)
{
char separator = listSeparator ?? GetListSeparator(culture);

return regexCache.GetOrAdd(separator, s =>
new Regex(BaseRegex.Replace('\'', '"').Replace(';', s), RegexOptions.Multiline | RegexOptions.ExplicitCapture, timeout));
return regexCache.GetOrAdd((isLine, separator, timeout), a =>
new Regex((isLine ? LineRegex : ValueRegex).Replace(';', a.separator), RegexOptions.Multiline | RegexOptions.ExplicitCapture, a.timeout));
}


private static char GetListSeparator(CultureInfo culture)
{
Expand Down Expand Up @@ -570,62 +590,60 @@ static CsvMemberCache()
public static List<CsvMemberInfo<T>> Members;
}

static string DecodeCsv(string s)


static ReadOnlySpan<char> DecodeCsv(ReadOnlySpan<char> s)
{
if (s.StartsWith("\"") && s.EndsWith("\""))
{
string str = s[1..^1].Replace("\"\"", "\"");
string str = new string(s[1..^1]).Replace("\"\"", "\"");

return Regex.Replace(str, "(?<!\r)\n", "\r\n");
}

return s;
}

static object? ConvertTo(string s, Type type, CultureInfo culture, string? format)
static ValueParser GetBasicParser(Type type, CultureInfo culture, string? format)
{
Type? baseType = Nullable.GetUnderlyingType(type);
if (baseType != null)
return type switch
{
if (!s.HasText())
return null;

type = baseType;
}

if (type.IsEnum)
return Enum.Parse(type, s);

if (type == typeof(DateTime))
if (format == null)
return DateTime.Parse(s, culture);
else
return DateTime.ParseExact(s, format, culture);

if (type == typeof(DateOnly))
if (format == null)
return DateOnly.Parse(s, culture);
else
return DateOnly.ParseExact(s, format, culture);

if (type == typeof(Guid))
return Guid.Parse(s);

return Convert.ChangeType(s, type, culture);
_ when type == typeof(string) => str => str.Length == 0 ? null : str.ToString(),
_ when type == typeof(byte) => str => str.Length == 0 ? null : byte.Parse(str, NumberStyles.Integer, culture),
_ when type == typeof(sbyte) => str => str.Length == 0 ? null : sbyte.Parse(str, NumberStyles.Integer, culture),
_ when type == typeof(short) => str => str.Length == 0 ? null : short.Parse(str, NumberStyles.Integer, culture),
_ when type == typeof(ushort) => str => str.Length == 0 ? null : ushort.Parse(str, NumberStyles.Integer, culture),
_ when type == typeof(int) => str => str.Length == 0 ? null : int.Parse(str, NumberStyles.Integer, culture),
_ when type == typeof(uint) => str => str.Length == 0 ? null : uint.Parse(str, NumberStyles.Integer, culture),
_ when type == typeof(long) => str => str.Length == 0 ? null : long.Parse(str, NumberStyles.Integer, culture),
_ when type == typeof(ulong) => str => str.Length == 0 ? null : ulong.Parse(str, NumberStyles.Integer, culture),
_ when type == typeof(float) => str => str.Length == 0 ? null : float.Parse(str, NumberStyles.Float, culture),
_ when type == typeof(double) => str => str.Length == 0 ? null : double.Parse(str, NumberStyles.Float, culture),
_ when type == typeof(decimal) => str => str.Length == 0 ? null : decimal.Parse(str, NumberStyles.Number, culture),
_ when type == typeof(DateTime) => str => str.Length == 0 ? null : DateTime.ParseExact(str, format, culture),
_ when type == typeof(DateTimeOffset) => str => str.Length == 0 ? null : DateTimeOffset.ParseExact(str, format, culture),
_ when type == typeof(DateOnly) => str => str.Length == 0 ? null : DateOnly.ParseExact(str, format, culture),
_ when type == typeof(TimeOnly) => str => str.Length == 0 ? null : TimeOnly.ParseExact(str, format, culture),
_ when type == typeof(Guid) => str => str.Length == 0 ? null : Guid.Parse(str.ToString()),
_ when type.IsEnum => str => str.Length == 0 ? null : Enum.Parse(type, str),
_ => str => Convert.ChangeType(new string(str), type, culture)
};
}
}

public class CsvReadOptions<T> : CsvReadOptions
where T : class
{
public Func<CsvMemberInfo<T>, CultureInfo, Func<string, object?>?>? ParserFactory;
public Func<Match, T>? Constructor;
public Func<CsvMemberInfo<T>, CultureInfo, Csv.ValueParser?>? ParserFactory;
public CsvConstructor<T>? Constructor;
}

public delegate T CsvConstructor<T>(ReadOnlySpan<char> line);

public class CsvReadOptions
{
public bool AsumeSingleLine = false;
public Func<Exception, Match?, bool>? SkipError;
public bool AsumeSingleLine = true; //Breaking change!
public Func<Exception, string, bool>? SkipError;
public TimeSpan RegexTimeout = Regex.InfiniteMatchTimeout;
public char? ListSeparator;
}
Expand Down

1 comment on commit e17aeb8

@olmobrutall
Copy link
Collaborator Author

@olmobrutall olmobrutall commented on e17aeb8 Dec 10, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CSV Performance Improvements?

A few months ago I had a big 5 GB CSV, so I went ahead and changed the way the CSV parser works: instead of using Regex and string, it used a custom parser based on Span<T>.

All my intuitions made me think this code should be faster, so I proudly committed and pushed, sure of the new insane performance.

Now, a few months later, I wanted to brag about the performance improvements, so I took the time to measure the performance of the old and new versions:

Old Code:  (Regex, string)
00:00:18.1489790
00:00:18.0134255
00:00:18.1483431

New Code: (Custom reader, Span<T>)
00:01:05.8259662
00:01:02.2113966

Oh shit... the new code is more than 3x slower. How is that possible? I was reducing allocations and all...

No idea. If someone wants to take a deeper look, go for it... As for me, I'm going to revert to the old code, with the only exception of making AsumeSingleLine = true the default.

The moral of the story: measure your code! Every time I open the heavy profiler, I find something new.

Please sign in to comment.