From 3e1949eed25ee3c0bafc03758afae7263e60cf14 Mon Sep 17 00:00:00 2001
From: "Mateusz \"Serafin\" Gajewski"
Date: Wed, 5 Feb 2025 10:04:28 +0100
Subject: [PATCH] Add SliceUtf8.toTitleCase()

---
 src/main/java/io/airlift/slice/SliceUtf8.java | 61 +++++++++++++++++++
 src/test/java/io/airlift/slice/TestSlice.java | 15 +++++
 2 files changed, 76 insertions(+)

diff --git a/src/main/java/io/airlift/slice/SliceUtf8.java b/src/main/java/io/airlift/slice/SliceUtf8.java
index 8ffe408b..dc64b744 100644
--- a/src/main/java/io/airlift/slice/SliceUtf8.java
+++ b/src/main/java/io/airlift/slice/SliceUtf8.java
@@ -299,6 +299,67 @@ public static Slice toLowerCase(Slice utf8)
         return translateCodePoints(utf8, LOWER_CODE_POINTS);
     }
 
+    /**
+     * Returns a copy of {@code utf8} converted to title case.
+     * <p>
+     * The first code point, and every code point that directly follows a code point
+     * flagged in {@code WHITESPACE_CODE_POINTS}, is mapped with
+     * {@link Character#toTitleCase(int)}; every other code point is mapped through
+     * {@code LOWER_CODE_POINTS}. Byte sequences that {@code tryGetCodePointAt} rejects
+     * (negative result) are copied to the output unchanged.
+     *
+     * @param utf8 the UTF-8 encoded input
+     * @return a slice holding the converted text
+     */
+    public static Slice toTitleCase(Slice utf8)
+    {
+        int length = utf8.length();
+        Slice newUtf8 = Slices.allocate(length);
+
+        int position = 0;
+        int newPosition = 0;
+        boolean titleNext = true;
+        while (position < length) {
+            int codePoint = tryGetCodePointAt(utf8, position);
+            if (codePoint >= 0) {
+                // Unicode title case differs from upper case for some letters
+                // (e.g. U+01C6 maps to U+01C5, not U+01C4), so word-initial
+                // code points are not taken from UPPER_CODE_POINTS
+                int newCodePoint = titleNext ? Character.toTitleCase(codePoint) : LOWER_CODE_POINTS[codePoint];
+                // only the code point right after whitespace starts a new word
+                titleNext = WHITESPACE_CODE_POINTS[codePoint];
+
+                // grow slice if necessary; the mapped code point may need more bytes
+                int nextNewPosition = newPosition + lengthOfCodePoint(newCodePoint);
+                if (nextNewPosition > length) {
+                    newUtf8 = Slices.ensureSize(newUtf8, nextNewPosition);
+                }
+
+                // write the mapped code point
+                setCodePointAt(newCodePoint, newUtf8, newPosition);
+
+                position += lengthOfCodePoint(codePoint);
+                newPosition = nextNewPosition;
+            }
+            else {
+                // negative result: its magnitude is used as the number of bytes to copy verbatim
+                int skipLength = -codePoint;
+
+                // grow slice if necessary
+                int nextNewPosition = newPosition + skipLength;
+                if (nextNewPosition > length) {
+                    newUtf8 = Slices.ensureSize(newUtf8, nextNewPosition);
+                }
+
+                copyUtf8SequenceUnsafe(utf8, position, newUtf8, newPosition, skipLength);
+                position += skipLength;
+                newPosition = nextNewPosition;
+            }
+        }
+        // trim to the bytes actually written
+        return newUtf8.slice(0, newPosition);
+    }
+
     private static Slice translateCodePoints(Slice utf8, int[] codePointTranslationMap)
     {
         int length = utf8.length();
diff --git a/src/test/java/io/airlift/slice/TestSlice.java b/src/test/java/io/airlift/slice/TestSlice.java
index e53ee939..14cc7ec6 100644
--- a/src/test/java/io/airlift/slice/TestSlice.java
+++ b/src/test/java/io/airlift/slice/TestSlice.java
@@ -30,6 +30,7 @@
 import static io.airlift.slice.SizeOf.SIZE_OF_SHORT;
 import static io.airlift.slice.SizeOf.instanceSize;
 import static io.airlift.slice.SizeOf.sizeOfByteArray;
+import static io.airlift.slice.SliceUtf8.toTitleCase;
 import static io.airlift.slice.Slices.EMPTY_SLICE;
 import static io.airlift.slice.Slices.utf8Slice;
 import static java.lang.Double.doubleToLongBits;
@@ -196,6 +197,20 @@ public void testUtf8Conversion()
         assertThat(utf8Slice(s).toStringUtf8()).isEqualTo(s);
     }
 
+    @Test
+    public void testUtf8TitleCaseConversion()
+    {
+        String s = "apple \u2603 snowman";
+        Slice slice = Slices.copiedBuffer(s, UTF_8);
+
+        assertThat(toTitleCase(utf8Slice(s))).isEqualTo(toTitleCase(slice));
+        assertThat(toTitleCase(slice).toStringUtf8()).isEqualTo("Apple \u2603 Snowman");
+        assertThat(utf8Slice(s).toStringUtf8()).isEqualTo(s);
+
+        // title case is not upper case: U+01C6 title-cases to U+01C5, upper-cases to U+01C4
+        assertThat(toTitleCase(utf8Slice("\u01C6"))).isEqualTo(utf8Slice("\u01C5"));
+    }
+
     @SuppressWarnings("CharUsedInArithmeticContext")
     private static void assertToStrings(Slice slice, int index)
     {