Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add PackFromRgbPlanes AVX2 vectorised implementation for Rgba32 and Rgb24 pixels #1242

Closed
wants to merge 17 commits into from
Closed
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,9 @@ internal static void ByteToNormalizedFloatReduce(

if (adjustedCount > 0)
{
ByteToNormalizedFloat(source.Slice(0, adjustedCount), dest.Slice(0, adjustedCount));
ByteToNormalizedFloat(
source.Slice(0, adjustedCount),
dest.Slice(0, adjustedCount));

source = source.Slice(adjustedCount);
dest = dest.Slice(adjustedCount);
Expand Down
184 changes: 184 additions & 0 deletions src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,190 @@ internal static void NormalizedFloatToByteSaturate(
}
}
}

/// <summary>
/// Packs the leading, vector-aligned part of three byte planes into 4-byte groups
/// (channel0, channel1, channel2, 0xFF) with the AVX2 kernel, then advances every span
/// past the processed part so that, on return, the spans cover only the elements that
/// still have to be packed by the caller.
/// When AVX2 is not supported, or fewer than one full 256-bit vector of elements is
/// available, the spans are left untouched.
/// </summary>
/// <param name="channel0">Plane providing byte 0 of every packed group.</param>
/// <param name="channel1">Plane providing byte 1 of every packed group.</param>
/// <param name="channel2">Plane providing byte 2 of every packed group.</param>
/// <param name="dest">Packed output; needs 4 bytes per input element.</param>
internal static void PackBytesToUInt32SaturateChannel4Reduce(
ref ReadOnlySpan<byte> channel0,
ref ReadOnlySpan<byte> channel1,
ref ReadOnlySpan<byte> channel2,
ref Span<byte> dest)
{
    DebugGuard.IsTrue(channel1.Length == channel0.Length, nameof(channel1), "Input spans must be of same length!");
    DebugGuard.IsTrue(channel2.Length == channel0.Length, nameof(channel2), "Input spans must be of same length!");

    // The kernel emits 4 output bytes for every input element.
    DebugGuard.IsTrue(dest.Length >= channel0.Length * 4, nameof(dest), "Destination span is too short!");

    if (Avx2.IsSupported)
    {
        int remainder = ImageMaths.ModuloP2(channel0.Length, Vector256<byte>.Count);
        int adjustedCount = channel0.Length - remainder;

        if (adjustedCount > 0)
        {
            int packedLength = adjustedCount * 4;

            // Process the aligned prefix FIRST; slicing before the call would hand the
            // kernel the remainder and silently drop the prefix.
            PackBytesToUInt32SaturateChannel4(
                channel0.Slice(0, adjustedCount),
                channel1.Slice(0, adjustedCount),
                channel2.Slice(0, adjustedCount),
                dest.Slice(0, packedLength));

            // Leave only the unprocessed tail for the caller.
            channel0 = channel0.Slice(adjustedCount);
            channel1 = channel1.Slice(adjustedCount);
            channel2 = channel2.Slice(adjustedCount);
            dest = dest.Slice(packedLength);
        }
    }
}

/// <summary>
/// AVX2 kernel that interleaves three byte planes into 4-byte groups laid out as
/// (channel0, channel1, channel2, 0xFF).
/// Each loop iteration reads one 256-bit vector (32 bytes) from every plane and writes
/// four 256-bit vectors (128 bytes) to <paramref name="dest"/>.
/// </summary>
/// <remarks>
/// No length validation is performed here; the iteration count is derived solely from
/// <paramref name="dest"/>, and all accesses go through unchecked <c>Unsafe.Add</c>.
/// NOTE(review): callers presumably must pass planes of equal length that is a multiple
/// of 32 and a destination of 4x that length - confirm against the call sites.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static void PackBytesToUInt32SaturateChannel4(
ReadOnlySpan<byte> channel0,
ReadOnlySpan<byte> channel1,
ReadOnlySpan<byte> channel2,
Span<byte> dest)
{
// Number of 256-bit vectors available in the destination.
int n = dest.Length / Vector256<byte>.Count;

ref Vector256<byte> source0Base =
ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(channel0));
ref Vector256<byte> source1Base =
ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(channel1));
ref Vector256<byte> source2Base =
ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(channel2));

ref Vector256<byte> destBase = ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(dest));

// Comparing zero with zero sets every bit, i.e. every byte is 0xFF (the constant fourth channel).
Vector256<byte> allOnes = Avx2.CompareEqual(Vector256<byte>.Zero, Vector256<byte>.Zero);

// i indexes source vectors, j indexes destination vectors (four per source vector).
for (int i = 0, j = 0; j < n; i += 1, j += 4)
{
Vector256<byte> s0 = Unsafe.Add(ref source0Base, i);
Vector256<byte> s1 = Unsafe.Add(ref source1Base, i);
Vector256<byte> s2 = Unsafe.Add(ref source2Base, i);

// Control 0b_11_01_10_00 swaps the two middle 64-bit quarters. The unpack instructions
// below work per 128-bit lane, so this pre-arrangement makes their output sequential.
// (The lines between this statement and the next permute are review-thread text
// captured together with the diff; they are not part of the code.)
s0 = Avx2.Permute4x64(s0.AsUInt64(), 0b_11_01_10_00).AsByte();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I'm not missing something here we can pack these with less work in a manner similar to how it's done in the jpeg color converter?

internal static void ConvertCore(in ComponentValues values, Span<Vector4> result, float maxValue, float halfValue)

Copy link
Contributor

@saucecontrol saucecontrol Oct 24, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, it's the same problem. The unpack operations work in-lane, so whatever starts in the upper lane of the input will be in the upper lanes of the output. The current code re-permutes after each round to keep things in the right place throughout. The other options are:

  1. Deal with it at the end, by writing the 4 lower lanes to the output first followed by the 4 upper lanes, or permuting the 128-bit pairs together before writing.
  2. Rearrange the input so the blocks of 4 bytes (which will become blocks of 16 bytes) will come out paired correctly after the unpacks.

Option 2 will be cheaper since extract costs the same as a permute, and as with the YCbCr conversion, you can get by with only permuting 3 inputs for 4 outputs.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thought so, thanks for confirming. 👍

s1 = Avx2.Permute4x64(s1.AsUInt64(), 0b_11_01_10_00).AsByte();
s2 = Avx2.Permute4x64(s2.AsUInt64(), 0b_11_01_10_00).AsByte();

// Byte interleave: 16-bit pairs (channel0, channel1) for elements 0..15 (Lo) and 16..31 (Hi).
Vector256<ushort> s01Lo = Avx2.UnpackLow(s0, s1).AsUInt16();
Vector256<ushort> s01Hi = Avx2.UnpackHigh(s0, s1).AsUInt16();

// Same quarter swap again, preparing for the 16-bit unpack round.
s01Lo = Avx2.Permute4x64(s01Lo.AsUInt64(), 0b_11_01_10_00).AsUInt16();
s01Hi = Avx2.Permute4x64(s01Hi.AsUInt64(), 0b_11_01_10_00).AsUInt16();

// Byte interleave: 16-bit pairs (channel2, 0xFF).
Vector256<ushort> s23Lo = Avx2.UnpackLow(s2, allOnes).AsUInt16();
Vector256<ushort> s23Hi = Avx2.UnpackHigh(s2, allOnes).AsUInt16();

s23Lo = Avx2.Permute4x64(s23Lo.AsUInt64(), 0b_11_01_10_00).AsUInt16();
s23Hi = Avx2.Permute4x64(s23Hi.AsUInt64(), 0b_11_01_10_00).AsUInt16();

// 16-bit interleave of the two pair streams yields complete 4-byte groups:
// b0 = elements 0..7, b1 = 8..15, b2 = 16..23, b3 = 24..31.
Vector256<byte> b0 = Avx2.UnpackLow(s01Lo, s23Lo).AsByte();
Vector256<byte> b1 = Avx2.UnpackHigh(s01Lo, s23Lo).AsByte();
Vector256<byte> b2 = Avx2.UnpackLow(s01Hi, s23Hi).AsByte();
Vector256<byte> b3 = Avx2.UnpackHigh(s01Hi, s23Hi).AsByte();

Unsafe.Add(ref destBase, j) = b0;
Unsafe.Add(ref destBase, j + 1) = b1;
Unsafe.Add(ref destBase, j + 2) = b2;
Unsafe.Add(ref destBase, j + 3) = b3;
}
}

/// <summary>
/// Packs the leading, vector-aligned part of three byte planes into 3-byte groups
/// (channel0, channel1, channel2) with the AVX2 kernel, then advances every span past
/// the processed part so that, on return, the spans cover only the elements that still
/// have to be packed by the caller.
/// When AVX2 is not supported, or fewer than one full 256-bit vector of elements is
/// available, the spans are left untouched.
/// </summary>
/// <param name="channel0">Plane providing byte 0 of every packed group.</param>
/// <param name="channel1">Plane providing byte 1 of every packed group.</param>
/// <param name="channel2">Plane providing byte 2 of every packed group.</param>
/// <param name="dest">Packed output; needs 3 bytes per input element.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static void PackBytesToUInt24Reduce(
ref ReadOnlySpan<byte> channel0,
ref ReadOnlySpan<byte> channel1,
ref ReadOnlySpan<byte> channel2,
ref Span<byte> dest)
{
    DebugGuard.IsTrue(channel1.Length == channel0.Length, nameof(channel1), "Input spans must be of same length!");
    DebugGuard.IsTrue(channel2.Length == channel0.Length, nameof(channel2), "Input spans must be of same length!");

    // The kernel emits 3 output bytes for every input element.
    DebugGuard.IsTrue(dest.Length >= channel0.Length * 3, nameof(dest), "Destination span is too short!");

    if (Avx2.IsSupported)
    {
        int remainder = ImageMaths.ModuloP2(channel0.Length, Vector256<byte>.Count);
        int adjustedCount = channel0.Length - remainder;

        if (adjustedCount > 0)
        {
            int packedLength = adjustedCount * 3;

            // Process the aligned prefix FIRST; slicing before the call would hand the
            // kernel the remainder and silently drop the prefix.
            PackBytesToUInt24(
                channel0.Slice(0, adjustedCount),
                channel1.Slice(0, adjustedCount),
                channel2.Slice(0, adjustedCount),
                dest.Slice(0, packedLength));

            // Leave only the unprocessed tail for the caller.
            channel0 = channel0.Slice(adjustedCount);
            channel1 = channel1.Slice(adjustedCount);
            channel2 = channel2.Slice(adjustedCount);
            dest = dest.Slice(packedLength);
        }
    }
}

/// <summary>
/// AVX2 kernel that interleaves three byte planes into 3-byte groups laid out as
/// (channel0, channel1, channel2).
/// Each loop iteration reads one 256-bit vector (32 bytes) from every plane and writes
/// three 256-bit vectors (96 bytes) to <paramref name="dest"/>.
/// </summary>
/// <param name="channel0">Plane providing byte 0 of every group; length must be a multiple of 32.</param>
/// <param name="channel1">Plane providing byte 1 of every group; same length as <paramref name="channel0"/>.</param>
/// <param name="channel2">Plane providing byte 2 of every group; same length as <paramref name="channel0"/>.</param>
/// <param name="dest">Packed output; must hold at least 3 bytes per input element.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static void PackBytesToUInt24(
ReadOnlySpan<byte> channel0,
ReadOnlySpan<byte> channel1,
ReadOnlySpan<byte> channel2,
Span<byte> dest)
{
    // The destination is three times as long as a plane, so an "equal length" check
    // against dest would reject every valid call; validate the real contract instead.
    DebugGuard.IsTrue(channel1.Length == channel0.Length, nameof(channel1), "Input spans must be of same length!");
    DebugGuard.IsTrue(channel2.Length == channel0.Length, nameof(channel2), "Input spans must be of same length!");
    DebugGuard.IsTrue(
        ImageMaths.ModuloP2(channel0.Length, Vector256<byte>.Count) == 0,
        nameof(channel0),
        $"length should be divisible by {Vector256<byte>.Count}!");
    DebugGuard.IsTrue(dest.Length >= channel0.Length * 3, nameof(dest), "Destination span is too short!");

    // Iterate over source vectors so an oversized destination can never cause
    // reads past the end of the planes.
    int vectorCount = channel0.Length / Vector256<byte>.Count;

    ref Vector256<byte> source0Base =
        ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(channel0));
    ref Vector256<byte> source1Base =
        ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(channel1));
    ref Vector256<byte> source2Base =
        ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(channel2));

    ref Vector256<byte> destBase =
        ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(dest));

    // Byte-shuffle control masks. An entry of -1 has its high bit set, which makes the
    // shuffle emit zero at that position; any other entry selects a byte from the same
    // 128-bit lane of the operand. sXMaskY places channel X into output vector Y, so
    // OR-ing the three shuffled channels yields one finished output vector.
    Vector256<byte> s0Mask0 = Vector256.Create(0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1).AsByte();
    Vector256<byte> s0Mask1 = Vector256.Create(-1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5).AsByte();
    Vector256<byte> s0Mask2 = Vector256.Create(-1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1).AsByte();

    Vector256<byte> s1Mask0 = Vector256.Create(-1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10).AsByte();
    Vector256<byte> s1Mask1 = Vector256.Create(-1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1).AsByte();
    Vector256<byte> s1Mask2 = Vector256.Create(5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1).AsByte();

    Vector256<byte> s2Mask0 = Vector256.Create(-1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1).AsByte();
    Vector256<byte> s2Mask1 = Vector256.Create(10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1).AsByte();
    Vector256<byte> s2Mask2 = Vector256.Create(-1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15).AsByte();

    // i indexes source vectors, j indexes destination vectors (three per source vector).
    for (int i = 0, j = 0; i < vectorCount; i += 1, j += 3)
    {
        Vector256<byte> s0 = Unsafe.Add(ref source0Base, i);
        Vector256<byte> s1 = Unsafe.Add(ref source1Base, i);
        Vector256<byte> s2 = Unsafe.Add(ref source2Base, i);

        // Output vector 0 only needs source bytes 0..10, all from the lower 128 bits;
        // control 0 duplicates that lower half into both lanes so the in-lane shuffle can reach it.
        Vector256<byte> loS0 = Avx2.Permute2x128(s0, s0, 0);
        Vector256<byte> loS1 = Avx2.Permute2x128(s1, s1, 0);
        Vector256<byte> loS2 = Avx2.Permute2x128(s2, s2, 0);

        Vector256<byte> b0 = Avx2.Shuffle(loS0, s0Mask0);
        b0 = Avx2.Or(b0, Avx2.Shuffle(loS1, s1Mask0));
        b0 = Avx2.Or(b0, Avx2.Shuffle(loS2, s2Mask0));

        // Output vector 1 straddles the middle: its lower lane draws from the lower source
        // half and its upper lane from the upper source half, so no permute is required.
        Vector256<byte> b1 = Avx2.Shuffle(s0, s0Mask1);
        b1 = Avx2.Or(b1, Avx2.Shuffle(s1, s1Mask1));
        b1 = Avx2.Or(b1, Avx2.Shuffle(s2, s2Mask1));

        // Output vector 2 only needs bytes from the upper 128 bits;
        // control 0b_0001_0001 duplicates the upper half into both lanes.
        Vector256<byte> hiS0 = Avx2.Permute2x128(s0, s0, 0b_0001_0001);
        Vector256<byte> hiS1 = Avx2.Permute2x128(s1, s1, 0b_0001_0001);
        Vector256<byte> hiS2 = Avx2.Permute2x128(s2, s2, 0b_0001_0001);

        Vector256<byte> b2 = Avx2.Shuffle(hiS0, s0Mask2);
        b2 = Avx2.Or(b2, Avx2.Shuffle(hiS1, s1Mask2));
        b2 = Avx2.Or(b2, Avx2.Shuffle(hiS2, s2Mask2));

        Unsafe.Add(ref destBase, j + 0) = b0;
        Unsafe.Add(ref destBase, j + 1) = b1;
        Unsafe.Add(ref destBase, j + 2) = b2;
    }
}
}
}
}
Expand Down
107 changes: 107 additions & 0 deletions src/ImageSharp/Common/Helpers/SimdUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,103 @@ internal static void NormalizedFloatToByteSaturate(ReadOnlySpan<float> source, S
}
}

/// <summary>
/// Packs three byte planes into <paramref name="dest"/>: the hardware-intrinsics
/// "Reduce" helper is tried first (when compiled in), and whatever the spans still
/// cover afterwards is handed to the scalar remainder routine.
/// </summary>
/// <remarks>
/// NOTE(review): the guards below require <c>dest.Length</c> to EQUAL the channel length,
/// while <c>PackBytesToUInt32SaturateChannel4Remainder</c> requires
/// <c>dest.Length &gt;= channel0.Length * 4</c>. Both cannot hold for non-empty input -
/// confirm the intended contract (presumably 4 destination bytes per element).
/// </remarks>
internal static void PackBytesToUInt32SaturateChannel4(
ReadOnlySpan<byte> channel0,
ReadOnlySpan<byte> channel1,
ReadOnlySpan<byte> channel2,
Span<byte> dest)
{
DebugGuard.IsTrue(channel0.Length == dest.Length, nameof(channel0), "Input spans must be of same length!");
DebugGuard.IsTrue(channel1.Length == dest.Length, nameof(channel1), "Input spans must be of same length!");
DebugGuard.IsTrue(channel2.Length == dest.Length, nameof(channel2), "Input spans must be of same length!");

#if SUPPORTS_RUNTIME_INTRINSICS
// The spans are passed by reference; the helper may reassign them.
HwIntrinsics.PackBytesToUInt32SaturateChannel4Reduce(ref channel0, ref channel1, ref channel2, ref dest);

// I can't immediately see a way to do this operation efficiently with Vector<T> or Vector4<T>. TODO
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is none :)

// The two branches below contain only commented-out calls, so on those targets the
// whole input is handled by the scalar remainder routine.
#elif SUPPORTS_EXTENDED_INTRINSICS
// ExtendedIntrinsics.PackBytesToUInt32SaturateChannel4Reduce(ref channel0, ref channel1, ref channel2, ref dest);
#else
// BasicIntrinsics256.PackBytesToUInt32SaturateChannel4Reduce(ref channel0, ref channel1, ref channel2, ref dest);
#endif

// Deal with the remainder:
if (channel0.Length > 0)
{
PackBytesToUInt32SaturateChannel4Remainder(channel0, channel1, channel2, dest);
}
}

/// <summary>
/// Scalar fallback that packs three byte planes into 4-byte groups laid out as
/// (channel0, channel1, channel2, 0xFF), one group per input element.
/// </summary>
/// <param name="channel0">Plane providing byte 0 of every group; its length is the element count.</param>
/// <param name="channel1">Plane providing byte 1 of every group.</param>
/// <param name="channel2">Plane providing byte 2 of every group.</param>
/// <param name="dest">Packed output; must hold at least 4 bytes per element.</param>
private static void PackBytesToUInt32SaturateChannel4Remainder(
ReadOnlySpan<byte> channel0,
ReadOnlySpan<byte> channel1,
ReadOnlySpan<byte> channel2,
Span<byte> dest)
{
    DebugGuard.MustBeGreaterThanOrEqualTo(dest.Length, channel0.Length * 4, nameof(dest));

    ref byte s0Base = ref MemoryMarshal.GetReference(channel0);
    ref byte s1Base = ref MemoryMarshal.GetReference(channel1);
    ref byte s2Base = ref MemoryMarshal.GetReference(channel2);
    ref byte dBase = ref MemoryMarshal.GetReference(dest);

    // Iterate over ELEMENTS (i), not destination bytes: bounding the loop by dest.Length
    // would run four times too far through unchecked Unsafe.Add accesses.
    int count = channel0.Length;
    for (int i = 0, j = 0; i < count; i += 1, j += 4)
    {
        Unsafe.Add(ref dBase, j) = Unsafe.Add(ref s0Base, i);
        Unsafe.Add(ref dBase, j + 1) = Unsafe.Add(ref s1Base, i);
        Unsafe.Add(ref dBase, j + 2) = Unsafe.Add(ref s2Base, i);

        // Fourth byte is the saturated constant channel; it must not overwrite channel 2.
        Unsafe.Add(ref dBase, j + 3) = 0xFF;
    }
}

/// <summary>
/// Packs three byte planes into 3-byte groups laid out as (channel0, channel1, channel2).
/// The hardware-intrinsics "Reduce" helper is tried first (when compiled in); whatever
/// the spans still cover afterwards is packed by the scalar remainder routine.
/// </summary>
/// <param name="channel0">Plane providing byte 0 of every group.</param>
/// <param name="channel1">Plane providing byte 1 of every group; same length as <paramref name="channel0"/>.</param>
/// <param name="channel2">Plane providing byte 2 of every group; same length as <paramref name="channel0"/>.</param>
/// <param name="dest">Packed output; must hold at least 3 bytes per element.</param>
internal static void PackBytesToUInt24(
ReadOnlySpan<byte> channel0,
ReadOnlySpan<byte> channel1,
ReadOnlySpan<byte> channel2,
Span<byte> dest)
{
    DebugGuard.IsTrue(channel1.Length == channel0.Length, nameof(channel1), "Input spans must be of same length!");
    DebugGuard.IsTrue(channel2.Length == channel0.Length, nameof(channel2), "Input spans must be of same length!");

    // The destination holds 3 bytes per element, so it is compared against the packed
    // size (as the remainder routine does) rather than against the plane length.
    DebugGuard.MustBeGreaterThanOrEqualTo(dest.Length, channel0.Length * 3, nameof(dest));

#if SUPPORTS_RUNTIME_INTRINSICS
    // The spans are passed by reference; the helper may reassign them.
    HwIntrinsics.PackBytesToUInt24Reduce(ref channel0, ref channel1, ref channel2, ref dest);

    // TODO: no efficient Vector<T> / Vector4 formulation of this operation is known yet,
    // so the two branches below are placeholders and everything falls through to the
    // scalar remainder on those targets.
#elif SUPPORTS_EXTENDED_INTRINSICS
    // ExtendedIntrinsics.PackBytesToUInt24Reduce(ref channel0, ref channel1, ref channel2, ref dest);
#else
    // BasicIntrinsics256.PackBytesToUInt24Reduce(ref channel0, ref channel1, ref channel2, ref dest);
#endif

    // Deal with the remainder:
    if (channel0.Length > 0)
    {
        PackBytesToUInt24Remainder(channel0, channel1, channel2, dest);
    }
}

/// <summary>
/// Scalar fallback that packs three byte planes into 3-byte groups laid out as
/// (channel0, channel1, channel2), one group per input element.
/// </summary>
/// <param name="channel0">Plane providing byte 0 of every group; its length is the element count.</param>
/// <param name="channel1">Plane providing byte 1 of every group.</param>
/// <param name="channel2">Plane providing byte 2 of every group.</param>
/// <param name="dest">Packed output; must hold at least 3 bytes per element.</param>
private static void PackBytesToUInt24Remainder(
ReadOnlySpan<byte> channel0,
ReadOnlySpan<byte> channel1,
ReadOnlySpan<byte> channel2,
Span<byte> dest)
{
    DebugGuard.MustBeGreaterThanOrEqualTo(dest.Length, channel0.Length * 3, nameof(dest));

    ref byte s0Base = ref MemoryMarshal.GetReference(channel0);
    ref byte s1Base = ref MemoryMarshal.GetReference(channel1);
    ref byte s2Base = ref MemoryMarshal.GetReference(channel2);
    ref byte dBase = ref MemoryMarshal.GetReference(dest);

    // Iterate over ELEMENTS (i), not destination bytes: bounding the loop by dest.Length
    // would run three times too far through unchecked Unsafe.Add accesses.
    int count = channel0.Length;
    for (int i = 0, j = 0; i < count; i += 1, j += 3)
    {
        Unsafe.Add(ref dBase, j) = Unsafe.Add(ref s0Base, i);
        Unsafe.Add(ref dBase, j + 1) = Unsafe.Add(ref s1Base, i);
        Unsafe.Add(ref dBase, j + 2) = Unsafe.Add(ref s2Base, i);
    }
}

[MethodImpl(InliningOptions.ColdPath)]
private static void ConvertByteToNormalizedFloatRemainder(ReadOnlySpan<byte> source, Span<float> dest)
{
Expand Down Expand Up @@ -192,6 +289,16 @@ private static void VerifySpanInput(ReadOnlySpan<byte> source, Span<float> dest,
$"length should be divisible by {shouldBeDivisibleBy}!");
}

/// <summary>
/// Debug-only sanity check for a byte source/destination pair: both spans must have the
/// same length, and that length must leave no remainder for the given divisor.
/// </summary>
[Conditional("DEBUG")]
private static void VerifySpanInput(ReadOnlySpan<byte> source, Span<byte> dest, int shouldBeDivisibleBy)
{
    bool lengthsMatch = source.Length == dest.Length;
    DebugGuard.IsTrue(lengthsMatch, nameof(source), "Input spans must be of same length!");

    int leftover = ImageMaths.ModuloP2(dest.Length, shouldBeDivisibleBy);
    DebugGuard.IsTrue(leftover == 0, nameof(source), $"length should be divisible by {shouldBeDivisibleBy}!");
}

[Conditional("DEBUG")]
private static void VerifySpanInput(ReadOnlySpan<float> source, Span<byte> dest, int shouldBeDivisibleBy)
{
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) Six Labors.
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.

// <auto-generated />
Expand All @@ -21,7 +21,7 @@ public partial struct Rgb24
/// <summary>
/// Provides optimized overrides for bulk operations.
/// </summary>
internal class PixelOperations : PixelOperations<Rgb24>
internal partial class PixelOperations : PixelOperations<Rgb24>
{
/// <inheritdoc />
public override void FromRgb24(Configuration configuration, ReadOnlySpan<Rgb24> source, Span<Rgb24> destinationPixels)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<#@include file="_Common.ttinclude" #>
<#@include file="_Common.ttinclude" #>
<#@ output extension=".cs" #>

namespace SixLabors.ImageSharp.PixelFormats
Expand All @@ -11,7 +11,7 @@ namespace SixLabors.ImageSharp.PixelFormats
/// <summary>
/// Provides optimized overrides for bulk operations.
/// </summary>
internal class PixelOperations : PixelOperations<Rgb24>
internal partial class PixelOperations : PixelOperations<Rgb24>
{
<# GenerateAllDefaultConversionMethods("Rgb24"); #>
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.

using System;
using System.Runtime.InteropServices;

namespace SixLabors.ImageSharp.PixelFormats
{
    /// <content>
    /// Provides optimized overrides for bulk operations.
    /// </content>
    public partial struct Rgb24
    {
        /// <summary>
        /// <see cref="PixelOperations{TPixel}"/> implementation optimized for <see cref="Rgb24"/>.
        /// </summary>
        internal partial class PixelOperations : PixelOperations<Rgb24>
        {
            /// <inheritdoc />
            public override void PackFromRgbPlanes(
                Configuration configuration,
                ReadOnlySpan<byte> redChannel,
                ReadOnlySpan<byte> greenChannel,
                ReadOnlySpan<byte> blueChannel,
                Span<Rgb24> destination)
            {
                Guard.NotNull(configuration, nameof(configuration));
                Guard.IsTrue(redChannel.Length == greenChannel.Length, nameof(redChannel), "Red channel must be same size as green channel");
                Guard.IsTrue(greenChannel.Length == blueChannel.Length, nameof(greenChannel), "Green channel must be same size as blue channel");
                Guard.DestinationShouldNotBeTooShort(redChannel, destination, nameof(destination));

                // Trim the destination to exactly one pixel per input element.
                destination = destination.Slice(0, redChannel.Length);

                // Rgb24 occupies 3 bytes per pixel, so the 3-byte packer is required here;
                // the 4-byte (0xFF-padded) packer would write past the end of this buffer.
                SimdUtils.PackBytesToUInt24(redChannel, greenChannel, blueChannel, MemoryMarshal.AsBytes(destination));
            }
        }
    }
}
Loading