Skip to content

Support string interning / deduplication within packets #11640

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
May 6, 2025
583 changes: 583 additions & 0 deletions src/Build.UnitTests/BackEnd/BinaryTranslator_Tests.cs

Large diffs are not rendered by default.

183 changes: 183 additions & 0 deletions src/Framework/BinaryTranslator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -55,18 +55,29 @@ internal static ITranslator GetWriteTranslator(Stream stream)
/// </summary>
private class BinaryReadTranslator : ITranslator
{
/// <summary>
/// The intern reader used while inside an intern scope.
/// It is created once in the constructor and resolves interned IDs read from this translator.
/// </summary>
private readonly InterningReadTranslator _interner;

/// <summary>
/// The binary reader used in read mode.
/// </summary>
private BinaryReader _reader;

/// <summary>
/// Whether the caller has entered an intern scope.
/// While false, the Intern / InternPath methods forward to the regular Translate method.
/// </summary>
private bool _isInterning;

#nullable enable
/// <summary>
/// Constructs a read-mode translator over the specified stream.
/// </summary>
/// <param name="packetStream">The stream to read from; it is handed to <paramref name="buffer"/> to create the reader.</param>
/// <param name="buffer">The factory which creates the <see cref="BinaryReader"/> for <paramref name="packetStream"/>.</param>
public BinaryReadTranslator(Stream packetStream, BinaryReaderFactory buffer)
{
_reader = buffer.Create(packetStream);
// The interner keeps a reference to this translator so it can read string IDs through it.
_interner = new InterningReadTranslator(this);
}
#nullable disable

Expand Down Expand Up @@ -782,6 +793,80 @@ public bool TranslateNullable<T>(T value)
bool haveRef = _reader.ReadBoolean();
return haveRef;
}

/// <summary>
/// Reads the intern header from the stream, then runs <paramref name="internBlock"/> with interning
/// enabled so that Intern / InternPath calls made inside it resolve IDs through the intern reader.
/// </summary>
/// <param name="comparer">Not used when reading.</param>
/// <param name="initialCapacity">Not used when reading.</param>
/// <param name="internBlock">The delegate which reads the packet body; it receives this translator.</param>
/// <exception cref="InvalidOperationException">Thrown if called while already inside an intern block.</exception>
public void WithInterning(IEqualityComparer<string> comparer, int initialCapacity, Action<ITranslator> internBlock)
{
    if (_isInterning)
    {
        throw new InvalidOperationException("Cannot enter recursive intern block.");
    }

    _isInterning = true;

    try
    {
        // Deserialize the intern header before entering the intern scope.
        _interner.Translate(this);

        // No other setup is needed since we can parse the packet directly from the stream.
        internBlock(this);
    }
    finally
    {
        // Always leave the scope, even if the header read or the block throws. Otherwise the flag
        // would stay set and every later call would fail with the recursive intern block error.
        _isInterning = false;
    }
}

/// <summary>
/// Reads a string. Inside an intern block the string is resolved through the intern reader;
/// outside of one this forwards to the regular Translate method.
/// </summary>
/// <param name="str">Receives the value which was read.</param>
/// <param name="nullable">Whether a null marker precedes the value in the stream.</param>
public void Intern(ref string str, bool nullable = true)
{
    if (_isInterning)
    {
        // The marker is only consumed when the value is nullable. The argument passed to
        // TranslateNullable is a placeholder, since in read mode the marker comes from the stream.
        bool hasValue = !nullable || TranslateNullable(string.Empty);
        str = hasValue ? _interner.Read() : null;
    }
    else
    {
        Translate(ref str);
    }
}

/// <summary>
/// Reads a string array. Inside an intern block each element is resolved through the intern reader;
/// outside of one this forwards to the regular Translate method.
/// </summary>
/// <param name="array">Receives the array which was read.</param>
public void Intern(ref string[] array)
{
    if (_isInterning)
    {
        // When the null marker is false the incoming reference is left untouched.
        if (TranslateNullable(array))
        {
            // The element count precedes the interned elements.
            array = new string[_reader.ReadInt32()];

            for (int index = 0; index < array.Length; index++)
            {
                array[index] = _interner.Read();
            }
        }
    }
    else
    {
        Translate(ref array);
    }
}

/// <summary>
/// Reads a path-like string. Inside an intern block the string is resolved through the intern
/// reader's path lookup; outside of one this forwards to the regular Translate method.
/// </summary>
/// <param name="str">Receives the value which was read.</param>
/// <param name="nullable">Whether a null marker precedes the value in the stream.</param>
public void InternPath(ref string str, bool nullable = true)
{
    if (_isInterning)
    {
        // The marker is only consumed when the value is nullable. The argument passed to
        // TranslateNullable is a placeholder, since in read mode the marker comes from the stream.
        bool hasValue = !nullable || TranslateNullable(string.Empty);
        str = hasValue ? _interner.ReadPath() : null;
    }
    else
    {
        Translate(ref str);
    }
}
}

/// <summary>
Expand All @@ -794,6 +879,18 @@ private class BinaryWriteTranslator : ITranslator
/// </summary>
private BinaryWriter _writer;

/// <summary>
/// The intern writer used in an intern scope.
/// This must be lazily instantiated since the interner has its own internal write translator, and
/// would otherwise go into a recursive loop on initialization.
/// </summary>
private InterningWriteTranslator _interner;

/// <summary>
/// Whether the caller has entered an intern scope.
/// While false, the Intern / InternPath methods forward to the regular Translate method.
/// </summary>
private bool _isInterning;

/// <summary>
/// Constructs a serializer from the specified stream, operating in the designated mode.
/// </summary>
Expand Down Expand Up @@ -1498,6 +1595,92 @@ public bool TranslateNullable<T>(T value)
_writer.Write(haveRef);
return haveRef;
}

/// <summary>
/// Runs <paramref name="internBlock"/> with interning enabled, buffering everything it writes,
/// and then emits the interner's content to the real output stream.
/// </summary>
/// <param name="comparer">The string comparer handed to the interner for this scope.</param>
/// <param name="initialCapacity">The initial capacity handed to the interner for this scope.</param>
/// <param name="internBlock">The delegate which writes the packet body; it receives this translator.</param>
/// <exception cref="InvalidOperationException">Thrown if called while already inside an intern block.</exception>
public void WithInterning(IEqualityComparer<string> comparer, int initialCapacity, Action<ITranslator> internBlock)
{
    if (_isInterning)
    {
        throw new InvalidOperationException("Cannot enter recursive intern block.");
    }

    // The interner is created on first use and set up again for every new scope.
    if (_interner is null)
    {
        _interner = new InterningWriteTranslator();
    }

    _interner.Setup(comparer, initialCapacity);

    // Temporarily redirect this translator's writes into the interner's buffer, so that
    // non-interned writes interleaved with interned ones keep their relative order.
    BinaryWriter originalWriter = _writer;
    _writer = _interner.Writer;
    _isInterning = true;

    try
    {
        internBlock(this);
    }
    finally
    {
        // Leave the scope and restore the real writer, even if the block throws.
        _isInterning = false;
        _writer = originalWriter;
    }

    // Write the interned buffer into the real output stream.
    _interner.Translate(this);
}

/// <summary>
/// Writes a string. Inside an intern block the string is handed to the intern writer;
/// outside of one this forwards to the regular Translate method.
/// </summary>
/// <param name="str">The value to write.</param>
/// <param name="nullable">Whether to write a null marker ahead of the value.</param>
public void Intern(ref string str, bool nullable = true)
{
    if (_isInterning)
    {
        // With a null marker, the value is only interned when the marker says it is present.
        if (!nullable || TranslateNullable(str))
        {
            _interner.Intern(str);
        }
    }
    else
    {
        Translate(ref str);
    }
}

/// <summary>
/// Writes a string array. Inside an intern block each element is handed to the intern writer;
/// outside of one this forwards to the regular Translate method.
/// </summary>
/// <param name="array">The array to write.</param>
public void Intern(ref string[] array)
{
    if (_isInterning)
    {
        if (TranslateNullable(array))
        {
            // The element count is written ahead of the interned elements.
            int length = array.Length;
            Translate(ref length);

            foreach (string element in array)
            {
                _interner.Intern(element);
            }
        }
    }
    else
    {
        Translate(ref array);
    }
}

/// <summary>
/// Writes a path-like string. Inside an intern block the string is handed to the intern writer's
/// path handling; outside of one this forwards to the regular Translate method.
/// </summary>
/// <param name="str">The value to write.</param>
/// <param name="nullable">Whether to write a null marker ahead of the value.</param>
public void InternPath(ref string str, bool nullable = true)
{
    if (_isInterning)
    {
        // With a null marker, the value is only interned when the marker says it is present.
        if (!nullable || TranslateNullable(str))
        {
            _interner.InternPath(str);
        }
    }
    else
    {
        Translate(ref str);
    }
}
}
}
}
54 changes: 54 additions & 0 deletions src/Framework/ITranslator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -366,5 +366,59 @@ void TranslateDictionary<D, T>(ref D dictionary, ObjectTranslator<T> objectTrans
/// <typeparam name="T">The type of object to test.</typeparam>
/// <returns>True if the object should be written, false otherwise.</returns>
bool TranslateNullable<T>(T value);

/// <summary>
/// Creates a scope which activates string interning / deduplication for the Intern and InternPath methods.
/// This should generally be called from the root level packet.
/// </summary>
/// <param name="comparer">The string comparer to use when populating the intern cache.</param>
/// <param name="initialCapacity">The initial capacity of the intern cache.</param>
/// <param name="internBlock">A delegate providing a translator, in which all Intern and InternPath calls will go through the intern cache.</param>
/// <remarks>
/// Packet interning is implemented via a header with an array of all interned strings, followed by the body in
/// which any interned / duplicated strings are replaced by their ID.
/// The two <see cref="TranslationDirection"/> modes have different ordering requirements, so it would not be
/// possible to implement direction-agnostic serialization via the Intern and InternPath methods alone:
/// - Write: Because we don't know the full list of strings ahead of time, we need to create a temporary buffer
/// for the packet body, which we can later offset when flushing to the real stream.
/// - Read: The intern header needs to be deserialized before the packet body, otherwise we won't know what
/// string each ID maps to.
/// This method hides these requirements from the caller, such that the underlying translator will
/// automatically handle the appropriate IO ordering when entering / exiting the delegate scope.
/// Intern scopes cannot be nested: the binary translators throw an InvalidOperationException when this
/// method is called from within <paramref name="internBlock"/>.
/// </remarks>
void WithInterning(IEqualityComparer<string> comparer, int initialCapacity, Action<ITranslator> internBlock);

/// <summary>
/// Interns the string if the translator is currently within an intern block.
/// Otherwise, this forwards to the regular Translate method.
/// </summary>
/// <param name="str">The value to be translated.</param>
/// <param name="nullable">
/// Whether to null check and translate the nullable marker.
/// Setting this to false can reduce packet sizes when interning large numbers of strings
/// which are validated to always be non-null, such as dictionary keys.
/// The reader and the writer must use the same value, since it decides whether a marker is present in the stream.
/// </param>
void Intern(ref string str, bool nullable = true);

/// <summary>
/// Interns each string in the array if the translator is currently within an intern block.
/// Otherwise, this forwards to the regular Translate method. To match that method's behavior, all
/// strings in the array are assumed to be non-null.
/// </summary>
/// <param name="array">The array to be translated.</param>
void Intern(ref string[] array);

/// <summary>
/// Interns the string if the translator is currently within an intern block.
/// Otherwise, this forwards to the regular Translate method.
/// If the string is determined to be path-like, the path components will be interned separately.
/// </summary>
/// <param name="str">The value to be translated.</param>
/// <param name="nullable">
/// Whether to null check and translate the nullable marker.
/// Setting this to false can reduce packet sizes when interning large numbers of strings
/// which are validated to always be non-null, such as dictionary keys.
/// The reader and the writer must use the same value, since it decides whether a marker is present in the stream.
/// </param>
void InternPath(ref string str, bool nullable = true);
}
}
7 changes: 7 additions & 0 deletions src/Framework/InternPathIds.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

namespace Microsoft.Build.BackEnd
{
/// <summary>
/// The pair of intern IDs which identifies a path that was interned as two separate components.
/// </summary>
/// <param name="DirectoryId">The intern ID of the directory component.</param>
/// <param name="FileNameId">The intern ID of the file name component.</param>
internal readonly record struct InternPathIds(int DirectoryId, int FileNameId);
}
84 changes: 84 additions & 0 deletions src/Framework/InterningReadTranslator.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.Collections.Generic;

namespace Microsoft.Build.BackEnd
{
    /// <summary>
    /// Reads strings from a translator which contains interned packets.
    /// </summary>
    /// <remarks>
    /// This keeps a reusable lookup table for deserializing packets interned by <see cref="InterningWriteTranslator"/>.
    /// <see cref="Translate"/> deserializes the intern header (the list of strings indexed by ID).
    /// <see cref="Read"/> and <see cref="ReadPath"/> then resolve the IDs found in the packet body.
    /// </remarks>
    internal sealed class InterningReadTranslator : ITranslatable
    {
        /// <summary>
        /// The translator which both the header and the IDs are read from.
        /// </summary>
        private readonly ITranslator _translator;

        /// <summary>
        /// The interned strings, indexed by ID.
        /// </summary>
        private List<string> _strings = new();

        /// <summary>
        /// Cache of paths which have already been concatenated from a directory / file name ID pair.
        /// </summary>
        private Dictionary<InternPathIds, string> _pathIdsToString = new();

        /// <summary>
        /// Creates an intern reader on top of a translator which must be in read mode.
        /// </summary>
        /// <param name="translator">The translator to read the header and IDs from.</param>
        /// <exception cref="InvalidOperationException">Thrown if the translator is not reading from a stream.</exception>
        internal InterningReadTranslator(ITranslator translator)
        {
            if (translator.Mode == TranslationDirection.ReadFromStream)
            {
                _translator = translator;
                return;
            }

            throw new InvalidOperationException(
                $"{nameof(InterningReadTranslator)} can only be used with {nameof(TranslationDirection.ReadFromStream)}.");
        }

        /// <summary>
        /// Reads one ID from the translator and returns the string stored under it.
        /// </summary>
        internal string? Read()
        {
            int id = -1;
            _translator.Translate(ref id);

            return _strings[id];
        }

        /// <summary>
        /// Reads a string which the writer stored either as a single ID or as a directory / file name ID pair.
        /// </summary>
        internal string? ReadPath()
        {
            // A false marker means the writer stored this value as a single interned string.
            bool isSplitPath = _translator.TranslateNullable(string.Empty);
            if (!isSplitPath)
            {
                return Read();
            }

            int directoryId = -1;
            int fileNameId = -1;
            _translator.Translate(ref directoryId);
            _translator.Translate(ref fileNameId);

            InternPathIds cacheKey = new(directoryId, fileNameId);

            // Only concatenate the components the first time a pair is seen; later reads hit the cache.
            if (!_pathIdsToString.TryGetValue(cacheKey, out string? fullPath))
            {
                fullPath = string.Concat(_strings[cacheKey.DirectoryId], _strings[cacheKey.FileNameId]);
                _pathIdsToString.Add(cacheKey, fullPath);
            }

            return fullPath;
        }

        /// <summary>
        /// Deserializes the intern header and resets the path cache.
        /// </summary>
        /// <param name="translator">
        /// Not used; the header is read from the translator supplied at construction.
        /// </param>
        public void Translate(ITranslator translator)
        {
            // Only the header is read here, since the caller reads the packet body directly from the stream.
            _translator.Translate(ref _strings);

            int expectedPathCount = _strings.Count;
#if NET
            // Reuse the existing dictionary between packets.
            _pathIdsToString.Clear();
            _pathIdsToString.EnsureCapacity(expectedPathCount);
#else
            // Presumably EnsureCapacity is not available on this target, so a new dictionary is allocated instead.
            _pathIdsToString = new Dictionary<InternPathIds, string>(expectedPathCount);
#endif
        }
    }
}
Loading