naitoh · theirix · Mar 25, 2026
diff --git a/lib/rbpdf.rb b/lib/rbpdf.rb
@@ -8246,6 +8246,17 @@ def out(s)
   #
   def UTF8StringToArray(str)
     if @cache_utf8_string_to_array[str]
+      # On a cache hit the byte-parsing loop is skipped, so @current_font['subsetchars']
+      # would never learn about these codepoints for the current font.  Without this
+      # update a second font that encounters the same string gets an empty glyph subset
+      # and those characters are stripped from the embedded font, rendering as invisible
+      # boxes in the PDF viewer even though the bytes are present in the content stream.
+      if @is_unicode && @current_font['subsetchars']
+        @cache_utf8_string_to_array[str].each do |unichar|
+          @current_font['subsetchars'][unichar] = true
+        end
+        setFontSubBuffer(@current_font['fontkey'], 'subsetchars', @current_font['subsetchars'])
+      end
       # return cached value
       return @cache_utf8_string_to_array[str].dup
     end

diff --git a/test/rbpdf_subsetchars_cache_test.rb b/test/rbpdf_subsetchars_cache_test.rb
@@ -0,0 +1,71 @@
+# frozen_string_literal: true
+
+# Copyright (c) 2011-2024 NAITOH Jun
+# Released under the MIT license
+# http://www.opensource.org/licenses/MIT
+
+require "test_helper"
+
+# Regression test for the UTF8StringToArray cache bug that causes Cyrillic
+# characters to be invisible in the PDF when the same string is rendered
+# under more than one font.
+#
+# UTF8StringToArray keeps an LRU cache keyed on the string value.  Before the
+# fix, a cache hit returned immediately, skipping the byte-parsing loop that
+# records each codepoint in @current_font['subsetchars'].  When the same string
+# was later processed under a different font, that font's subsetchars was never
+# updated, so getTrueTypeFontSubset stripped those glyphs from the embedded font
+# data and the characters rendered as invisible boxes in the PDF viewer.
+class RbpdfSubsetcharsCacheTest < Test::Unit::TestCase
+  class MYPDF < RBPDF
+    def getFontBuffer(font)
+      super
+    end
+
+    def call_utf8_string_to_array(str)
+      UTF8StringToArray(str)
+    end
+  end
+
+  CYRILLIC_TEXT = "Красного строителя"
+
+  # Codepoints above U+00FF must be added explicitly to subsetchars during
+  # rendering; subsetchars is only pre-populated for indices 0-255.
+  CYRILLIC_CODEPOINTS =
+    CYRILLIC_TEXT.each_char
+      .map(&:ord)
+      .select { |cp| cp > 255 }
+      .uniq
+      .sort
+      .freeze
+
+  test "UTF8StringToArray cache hit must update subsetchars for the current font" do
+    pdf = MYPDF.new
+    pdf.set_font_subsetting(true)
+    pdf.add_page
+
+    # ASCII-8BIT is the encoding UTF8StringToArray receives in practice.
+    str = CYRILLIC_TEXT.dup.force_encoding("ASCII-8BIT")
+
+    # Step 1: cache miss under freesansB — full parse runs, subsetchars populated.
+    pdf.set_font("freesans", "B", 12)
+    pdf.call_utf8_string_to_array(str)
+
+    bold_subsetchars = pdf.getFontBuffer("freesansB")["subsetchars"]
+    CYRILLIC_CODEPOINTS.each do |cp|
+      assert bold_subsetchars[cp],
+        "freesansB subsetchars missing U+#{cp.to_s(16).upcase.rjust(4, "0")} after cache-miss call"
+    end
+
+    # Step 2: cache hit under freesans — early return skips the subsetchars
+    # update, so freesans never learns about these codepoints (the bug).
+    pdf.set_font("freesans", "", 12)
+    pdf.call_utf8_string_to_array(str)
+
+    regular_subsetchars = pdf.getFontBuffer("freesans")["subsetchars"]
+    CYRILLIC_CODEPOINTS.each do |cp|
+      assert regular_subsetchars[cp],
+        "freesans subsetchars missing U+#{cp.to_s(16).upcase.rjust(4, "0")} after cache-hit call"
+    end
+  end
+end