Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for string interning #144

Merged
merged 1 commit into from
Dec 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Directory.Packages.props
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
<PackageVersion Include="Microsoft.CodeAnalysis.CSharp.CodeFix.Testing.XUnit" Version="1.1.2" />
<PackageVersion Include="Microsoft.CodeAnalysis.CSharp.Workspaces" Version="$(RoslynVersion)" />
<PackageVersion Include="Microsoft.CodeAnalysis.PublicApiAnalyzers" Version="3.3.4" />
<PackageVersion Include="Microsoft.NET.StringTools" Version="17.12.6" />
<PackageVersion Include="Microsoft.NET.Test.Sdk" Version="17.12.0" />
<PackageVersion Include="Microsoft.VisualStudio.Validation" Version="17.8.8" />
<PackageVersion Include="Nerdbank.Streams" Version="2.11.79" />
Expand Down
68 changes: 68 additions & 0 deletions src/Nerdbank.MessagePack/Converters/PrimitiveConverters.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
using System.Text;
using System.Text.Json.Nodes;
using Microsoft;
using Strings = Microsoft.NET.StringTools.Strings;

namespace Nerdbank.MessagePack.Converters;

Expand All @@ -27,6 +28,73 @@ internal class StringConverter : MessagePackConverter<string>
public override JsonObject? GetJsonSchema(JsonSchemaContext context, ITypeShape typeShape) => new() { ["type"] = "string" };
}

/// <summary>
/// Serializes a <see cref="string"/> and interns it during deserialization.
/// </summary>
/// <remarks>
/// Reading decodes the UTF-8 payload into a temporary character buffer and hands the decoded characters to
/// <see cref="Strings.WeakIntern(ReadOnlySpan{char})"/>. Writing is delegated directly to the writer.
/// </remarks>
internal class InterningStringConverter : MessagePackConverter<string>
{
    // Largest UTF-8 payload (in bytes) that is decoded into a stack buffer instead of a pooled array.
    // The buffer holds one char per encoded byte, so it can occupy up to twice this many bytes of stack.
    private const int MaxStackStringCharLength = 4096;

    /// <inheritdoc/>
    public override string? Read(ref MessagePackReader reader, SerializationContext context)
    {
        if (reader.TryReadNil())
        {
            return null;
        }

        // The payload is either available as one contiguous span or as a (possibly multi-segment) sequence.
        ReadOnlySequence<byte> fragmentedUtf8 = default;
        bool isContiguous = reader.TryReadStringSpan(out ReadOnlySpan<byte> contiguousUtf8);
        int utf8Length;
        if (isContiguous)
        {
            if (contiguousUtf8.IsEmpty)
            {
                return string.Empty;
            }

            utf8Length = contiguousUtf8.Length;
        }
        else
        {
            // Nil was ruled out above, so the sequence is expected to have a value here.
            fragmentedUtf8 = reader.ReadStringSequence()!.Value;
            utf8Length = checked((int)fragmentedUtf8.Length);
        }

        // Only payloads above the stack threshold borrow an array from the shared pool.
        char[]? rented = null;
        if (utf8Length > MaxStackStringCharLength)
        {
            rented = ArrayPool<char>.Shared.Rent(utf8Length);
        }

        try
        {
            // One char per encoded byte is the capacity used for the decoded text.
            Span<char> scratch = rented ?? stackalloc char[utf8Length];
            int written = isContiguous
                ? StringEncoding.UTF8.GetChars(contiguousUtf8, scratch)
                : StringEncoding.UTF8.GetChars(fragmentedUtf8, scratch);
            return Strings.WeakIntern(scratch[..written]);
        }
        finally
        {
            if (rented is not null)
            {
                ArrayPool<char>.Shared.Return(rented);
            }
        }
    }

    /// <inheritdoc/>
    public override void Write(ref MessagePackWriter writer, in string? value, SerializationContext context)
    {
        writer.Write(value);
    }

    /// <inheritdoc/>
    public override JsonObject? GetJsonSchema(JsonSchemaContext context, ITypeShape typeShape)
    {
        return new JsonObject { ["type"] = "string" };
    }
}

/// <summary>
/// Serializes a <see cref="bool"/>.
/// </summary>
Expand Down
32 changes: 32 additions & 0 deletions src/Nerdbank.MessagePack/MessagePackSerializer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,38 @@ public bool PreserveReferences
}
}

/// <summary>
/// Gets a value indicating whether to intern strings during deserialization.
/// </summary>
/// <remarks>
/// <para>
/// String interning means that a string that appears multiple times (within a single deserialization or across many)
/// in the msgpack data will be deserialized as the same <see cref="string"/> instance, reducing GC pressure.
/// </para>
/// <para>
/// When enabled, all deserialized strings are retained with a weak reference, allowing them to be garbage collected
/// while also being reusable for future deserializations as long as they are in memory.
/// </para>
/// <para>
/// This feature has a positive impact on memory usage but may have a negative impact on performance due to searching
/// through previously deserialized strings to find a match.
/// If your application is performance sensitive, you should measure the impact of this feature on your application.
/// </para>
/// <para>
/// This feature is orthogonal and complementary to <see cref="PreserveReferences"/>.
/// Preserving references impacts the serialized result and can hurt interoperability if the other party is not using the same feature.
/// Preserving references also does not guarantee that equal strings will be reused because the original serialization may have had
/// multiple string objects for the same value, so deserialization would produce the same result.
/// Preserving references alone will never reuse strings across top-level deserialization operations either.
/// Interning strings, however, has no impact on the serialized result and is always safe to use.
/// Interning strings will guarantee string objects are reused within and across deserialization operations so long as their values are equal.
/// The combination of the two features will ensure the most compact msgpack, and will produce faster deserialization times than string interning alone.
/// Combining the two features also activates special behavior to ensure that serialization only writes a string once
/// and references that string later in that same serialization, even if the equal strings were unique objects.
/// </para>
/// </remarks>
public bool InternStrings { get; init; }

/// <summary>
/// Gets the extension type codes to use for library-reserved extension types.
/// </summary>
Expand Down
1 change: 1 addition & 0 deletions src/Nerdbank.MessagePack/Nerdbank.MessagePack.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Microsoft.CodeAnalysis.PublicApiAnalyzers" PrivateAssets="all" />
<PackageReference Include="Microsoft.NET.StringTools" />
<PackageReference Include="Microsoft.VisualStudio.Validation" PrivateAssets="compile" />
<PackageReference Include="PolyType" />
<PackageReference Include="System.IO.Pipelines" />
Expand Down
2 changes: 2 additions & 0 deletions src/Nerdbank.MessagePack/PublicAPI.Unshipped.txt
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,8 @@ Nerdbank.MessagePack.MessagePackSerializer.DisposableSerializationContext.Value.
Nerdbank.MessagePack.MessagePackSerializer.GetJsonSchema(PolyType.Abstractions.ITypeShape! typeShape) -> System.Text.Json.Nodes.JsonObject!
Nerdbank.MessagePack.MessagePackSerializer.GetJsonSchema<T, TProvider>() -> System.Text.Json.Nodes.JsonObject!
Nerdbank.MessagePack.MessagePackSerializer.GetJsonSchema<T>() -> System.Text.Json.Nodes.JsonObject!
Nerdbank.MessagePack.MessagePackSerializer.InternStrings.get -> bool
Nerdbank.MessagePack.MessagePackSerializer.InternStrings.init -> void
Nerdbank.MessagePack.MessagePackSerializer.LibraryExtensionTypeCodes.get -> Nerdbank.MessagePack.LibraryReservedMessagePackExtensionTypeCode!
Nerdbank.MessagePack.MessagePackSerializer.LibraryExtensionTypeCodes.init -> void
Nerdbank.MessagePack.MessagePackSerializer.MultiDimensionalArrayFormat.get -> Nerdbank.MessagePack.MultiDimensionalArrayFormat
Expand Down
6 changes: 6 additions & 0 deletions src/Nerdbank.MessagePack/ReferenceEqualityTracker.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
using System.Diagnostics.CodeAnalysis;
using System.Runtime.CompilerServices;
using Microsoft;
using Microsoft.NET.StringTools;

namespace Nerdbank.MessagePack;

Expand Down Expand Up @@ -40,6 +41,11 @@ internal void WriteObject<T>(ref MessagePackWriter writer, T value, MessagePackC
Requires.NotNullAllowStructs(value);
Verify.Operation(this.Owner is not null, $"{nameof(this.Owner)} must be set before use.");

if (this.Owner.InternStrings && value is string)
{
value = (T)(object)Strings.WeakIntern((string)(object)value);
}

if (this.serializedObjects.TryGetValue(value, out int referenceId))
{
// This object has already been written. Skip it this time.
Expand Down
8 changes: 8 additions & 0 deletions src/Nerdbank.MessagePack/StandardVisitor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ internal class StandardVisitor : TypeShapeVisitor, ITypeShapeFunc
pair => pair.Key,
pair => (object)((IMessagePackConverter)pair.Value).WrapWithReferencePreservation());

private static readonly InterningStringConverter InterningStringConverter = new();
private static readonly MessagePackConverter<string> ReferencePreservingInterningStringConverter = InterningStringConverter.WrapWithReferencePreservation();

private readonly MessagePackSerializer owner;
private readonly TypeGenerationContext context;

Expand Down Expand Up @@ -89,6 +92,11 @@ internal StandardVisitor(MessagePackSerializer owner, TypeGenerationContext cont
return userDefinedConverter;
}

if (this.owner.InternStrings && typeof(T) == typeof(string))
{
return this.owner.PreserveReferences ? ReferencePreservingInterningStringConverter : InterningStringConverter;
}

// Check if the type has a built-in converter.
FrozenDictionary<Type, object> builtins = this.owner.PreserveReferences ? PrimitiveReferencePreservingConverters : PrimitiveConverters;
if (builtins.TryGetValue(typeof(T), out object? defaultConverter))
Expand Down
19 changes: 19 additions & 0 deletions test/Benchmarks/StringInterning.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// Copyright (c) Andrew Arnott. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.

/// <summary>
/// Benchmarks deserialization of a two-element string array with string interning turned off and on.
/// </summary>
[MemoryDiagnoser]
public partial class StringInterning
{
    /// <summary>Serializer with <c>InternStrings</c> explicitly disabled; serves as the baseline.</summary>
    private static readonly MessagePackSerializer NonInterning = new MessagePackSerializer { InternStrings = false };

    /// <summary>Serializer with <c>InternStrings</c> enabled.</summary>
    private static readonly MessagePackSerializer Interning = new MessagePackSerializer { InternStrings = true };

    /// <summary>Msgpack payload holding the same string value twice, produced once up front.</summary>
    private static readonly byte[] StringArrayMsgPack = NonInterning.Serialize<string[], Witness>(["Hello, World!", "Hello, World!"]);

    /// <summary>Deserializes the payload without interning.</summary>
    [Benchmark(Baseline = true)]
    public void NonInterning_StringArray()
    {
        NonInterning.Deserialize<string[], Witness>(StringArrayMsgPack);
    }

    /// <summary>Deserializes the payload with interning.</summary>
    [Benchmark]
    public void Interning_StringArray()
    {
        Interning.Deserialize<string[], Witness>(StringArrayMsgPack);
    }

    [GenerateShape<string[]>]
    private partial class Witness;
}
24 changes: 24 additions & 0 deletions test/Nerdbank.MessagePack.Tests/ReferencePreservationTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,29 @@ public void StringReferencePreservation()
Assert.Same(deserializedRoot.City, deserializedRoot.State);
}

/// <summary>
/// Verifies that two equal-valued but distinct string instances collapse into a single reference
/// after a round trip only when string interning is enabled.
/// </summary>
/// <param name="interning">Whether <c>InternStrings</c> is enabled on the serializer under test.</param>
[Theory, PairwiseData]
public void ReferenceConsolidationWhenInterningIsOn(bool interning)
{
    this.Serializer = this.Serializer with { InternStrings = interning };

    // Build a second instance carrying the same characters as the first.
    string original = "New York";
    string duplicate = (original + "A")[..^1];
    Assert.NotSame(original, duplicate); // guard: the two inputs really are separate objects

    string[]? roundtripped = this.Roundtrip<string[], Witness>([original, duplicate]);
    Assert.NotNull(roundtripped);

    // The two elements should be the very same object exactly when interning is enabled.
    bool sameInstance = ReferenceEquals(roundtripped[0], roundtripped[1]);
    Assert.Equal(interning, sameInstance);

    // Inspect the wire format: the second element is an extension (object reference) only with interning on.
    MessagePackReader reader = new(this.lastRoundtrippedMsgpack);
    Assert.Equal(2, reader.ReadArrayHeader());
    reader.ReadString(); // skip the first element
    MessagePackType expectedSecondElement = interning ? MessagePackType.Extension : MessagePackType.String;
    Assert.Equal(expectedSecondElement, reader.NextMessagePackType);
}

/// <summary>
/// Verifies that two distinct object whose by-value equality is considered equal are <em>combined</em> into just one reference.
/// </summary>
Expand Down Expand Up @@ -261,5 +284,6 @@ public override void Write(ref MessagePackWriter writer, in CustomType2? value,

[GenerateShape<CustomTypeWrapper[]>]
[GenerateShape<CustomType[]>]
[GenerateShape<string[]>]
private partial class Witness;
}
61 changes: 61 additions & 0 deletions test/Nerdbank.MessagePack.Tests/StringInterningTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// Copyright (c) Andrew Arnott. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.

/// <summary>
/// Tests for the serializer's <c>InternStrings</c> option.
/// </summary>
public partial class StringInterningTests : MessagePackSerializerTestBase
{
    /// <summary>
    /// Initializes a new instance of the <see cref="StringInterningTests"/> class
    /// with string interning enabled on the serializer under test.
    /// </summary>
    /// <param name="logger">The test output sink passed to the base class.</param>
    public StringInterningTests(ITestOutputHelper logger)
        : base(logger)
    {
        this.Serializer = this.Serializer with { InternStrings = true };
    }

    /// <summary>
    /// Verifies that a freshly constructed serializer has interning turned off.
    /// </summary>
    [Fact]
    public void InternStringsDefault()
    {
        MessagePackSerializer fresh = new();
        Assert.False(fresh.InternStrings);
    }

    /// <summary>
    /// Verifies that equal strings deserialize to distinct instances when interning is disabled.
    /// </summary>
    [Fact]
    public void NoInterning()
    {
        this.Serializer = this.Serializer with { InternStrings = false };

        string[]? result = this.Roundtrip<string[], Witness>(["a", "a"]);

        Assert.NotNull(result);
        Assert.NotSame(result[0], result[1]);
    }

    /// <summary>
    /// Verifies that equal strings deserialize to the same instance, both within one
    /// deserialization and across two separate ones, when interning is enabled.
    /// </summary>
    [Fact]
    public void Interning()
    {
        this.Serializer = this.Serializer with { InternStrings = true };

        string[]? firstPass = this.Roundtrip<string[], Witness>(["a", "a"]);
        Assert.NotNull(firstPass);
        Assert.Same(firstPass[0], firstPass[1]);

        // A second, independent round trip should hand back the instance from the first one.
        string[]? secondPass = this.Roundtrip<string[], Witness>(["a", "a"]);
        Assert.NotNull(secondPass);
        Assert.Same(firstPass[0], secondPass[0]);
    }

    /// <summary>
    /// Round-trips a <see langword="null"/> string with interning enabled.
    /// </summary>
    [Fact]
    public void Null()
    {
        this.Roundtrip<string, Witness>(null);
    }

    /// <summary>
    /// Round-trips an empty string with interning enabled.
    /// </summary>
    [Fact]
    public void Empty()
    {
        this.Roundtrip<string, Witness>(string.Empty);
    }

    /// <summary>
    /// Round-trips a 100,000-character string with interning enabled.
    /// </summary>
    [Fact]
    public void VeryLargeString()
    {
        this.Roundtrip<string, Witness>(new string('a', 100_000));
    }

    /// <summary>
    /// Verifies that a string whose serialized bytes are split across two segments still deserializes correctly.
    /// </summary>
    [Fact]
    public void Fragmented()
    {
        ReadOnlyMemory<byte> serialized = this.Serializer.Serialize<string, Witness>("abc");

        // Put the final byte into its own segment so the input is not one contiguous buffer.
        Sequence<byte> segmented = new();
        segmented.Append(serialized[..^1]);
        segmented.Append(serialized[^1..]);

        string? result = this.Serializer.Deserialize<string, Witness>(segmented);

        Assert.Equal("abc", result);
    }

    [GenerateShape<string>]
    [GenerateShape<string[]>]
    private partial class Witness;
}