Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions lib/rbpdf.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8246,6 +8246,17 @@ def out(s)
#
def UTF8StringToArray(str)
if @cache_utf8_string_to_array[str]
# On a cache hit the byte-parsing loop is skipped, so @current_font['subsetchars']
# would never learn about these codepoints for the current font. Without this
# update a second font that encounters the same string gets an empty glyph subset
# and those characters are stripped from the embedded font, rendering as invisible
# boxes in the PDF viewer even though the bytes are present in the content stream.
if @is_unicode && @current_font['subsetchars']
@cache_utf8_string_to_array[str].each do |unichar|
@current_font['subsetchars'][unichar] = true
end
setFontSubBuffer(@current_font['fontkey'], 'subsetchars', @current_font['subsetchars'])
end
# return cached value
return @cache_utf8_string_to_array[str].dup
end
Expand Down
71 changes: 71 additions & 0 deletions test/rbpdf_subsetchars_cache_test.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# frozen_string_literal: true

# Copyright (c) 2011-2024 NAITOH Jun
# Released under the MIT license
# http://www.opensource.org/licenses/MIT

require "test_helper"

# Regression test for the UTF8StringToArray cache bug that causes Cyrillic
# characters to be invisible in the PDF when the same string is rendered
# under more than one font.
#
# UTF8StringToArray keeps an LRU cache keyed on the string value. On a cache
# hit it returns immediately, skipping the byte-parsing loop that records each
# codepoint in @current_font['subsetchars']. When the same string is later
# processed under a different font, that font's subsetchars is never updated,
# so getTrueTypeFontSubset strips those glyphs from the embedded font data and
# the characters render as invisible boxes in the PDF viewer.
class RbpdfSubsetcharsCacheTest < Test::Unit::TestCase
  # RBPDF subclass through which the test reaches getFontBuffer and
  # UTF8StringToArray (presumably not publicly callable on RBPDF itself --
  # TODO confirm against lib/rbpdf.rb).
  class MYPDF < RBPDF
    # Forwards to RBPDF#getFontBuffer with the same argument.
    def getFontBuffer(font)
      super
    end

    # Calls UTF8StringToArray on +str+ and returns its result.
    def call_utf8_string_to_array(str)
      UTF8StringToArray(str)
    end
  end

  CYRILLIC_TEXT = "Красного строителя"

  # Distinct codepoints of CYRILLIC_TEXT above U+00FF, in ascending order.
  # Only these are checked: subsetchars is only pre-populated for indices
  # 0-255, so anything higher has to be registered while rendering.
  CYRILLIC_CODEPOINTS =
    CYRILLIC_TEXT.codepoints.uniq.select { |cp| cp > 255 }.sort.freeze

  test "UTF8StringToArray cache hit must update subsetchars for the current font" do
    pdf = MYPDF.new
    pdf.set_font_subsetting(true)
    pdf.add_page

    # ASCII-8BIT is the encoding UTF8StringToArray receives in practice.
    raw = CYRILLIC_TEXT.dup.force_encoding("ASCII-8BIT")

    # First pass, bold face: the string is not cached yet, so the full parse
    # runs and freesansB's subsetchars gets populated.
    pdf.set_font("freesans", "B", 12)
    pdf.call_utf8_string_to_array(raw)
    assert_cyrillic_in_subset(pdf, "freesansB", "cache-miss")

    # Second pass, regular face: the same string is now served from the
    # cache. The regular font must still learn about the codepoints; an early
    # return that skips the subsetchars update is the bug being guarded.
    pdf.set_font("freesans", "", 12)
    pdf.call_utf8_string_to_array(raw)
    assert_cyrillic_in_subset(pdf, "freesans", "cache-hit")
  end

  private

  # Asserts that every codepoint in CYRILLIC_CODEPOINTS is flagged in the
  # "subsetchars" entry of the font buffer stored under +font_key+.
  # +call_kind+ ("cache-miss" / "cache-hit") only labels the failure message.
  def assert_cyrillic_in_subset(pdf, font_key, call_kind)
    subset = pdf.getFontBuffer(font_key)["subsetchars"]
    CYRILLIC_CODEPOINTS.each do |cp|
      hex = "%04X" % cp
      assert subset[cp],
             "#{font_key} subsetchars missing U+#{hex} after #{call_kind} call"
    end
  end
end