diff --git a/lib/pdf/reader.rb b/lib/pdf/reader.rb index 96bef30e..abd631b7 100644 --- a/lib/pdf/reader.rb +++ b/lib/pdf/reader.rb @@ -284,6 +284,7 @@ def root require 'pdf/reader/bounding_rectangle_runs_filter' require 'pdf/reader/cid_widths' require 'pdf/reader/cmap' +require 'pdf/reader/disjoint_set' require 'pdf/reader/encoding' require 'pdf/reader/error' require 'pdf/reader/filter' @@ -303,6 +304,7 @@ def root require 'pdf/reader/object_hash' require 'pdf/reader/object_stream' require 'pdf/reader/pages_strategy' +require 'pdf/reader/paragraph' require 'pdf/reader/parser' require 'pdf/reader/point' require 'pdf/reader/print_receiver' diff --git a/lib/pdf/reader/disjoint_set.rb b/lib/pdf/reader/disjoint_set.rb new file mode 100644 index 00000000..ba0d8ea4 --- /dev/null +++ b/lib/pdf/reader/disjoint_set.rb @@ -0,0 +1,73 @@ +# coding: utf-8 +# typed: strict +# frozen_string_literal: true + +module PDF + class Reader + + # In computer science, a disjoint-set data structure, also called a union–find data structure or merge–find set, + # is a data structure that stores a collection of disjoint (non-overlapping) sets. Here #add makes an item the root of its own set (re-adding an item resets its parent and rank), #union links two roots by rank, #find follows parent links to the root and #sets groups all items by root. + class DisjointSet + include Enumerable + + def initialize + @parents = {} + @ranks = {} + end + + def contains(item) + @parents.key?(item) + end + + def each(&block) + return enum_for(:each) unless block_given? 
+ + @parents.each_key(&block) + end + + def length + @parents.length + end + + def add(x) + @parents[x] = x + @ranks[x] = 0 + self + end + + def find(x) + return x if @parents[x] == x + + find(@parents[x]) + end + + def sets + cluster_parents = {} + @parents.each_key do |x| + p = find(x) + cluster_parents[p] = [] unless cluster_parents.key?(p) + cluster_parents[p].push(x) + end + cluster_parents.values + end + + def union(x, y) + x_parent = find(x) + y_parent = find(y) + + return self if x_parent == y_parent + + if @ranks[x_parent] > @ranks[y_parent] + @parents[y_parent] = x_parent + elsif @ranks[y_parent] > @ranks[x_parent] + @parents[x_parent] = y_parent + else + @parents[y_parent] = x_parent + @ranks[x_parent] += 1 + end + + self + end + end + end +end diff --git a/lib/pdf/reader/page.rb b/lib/pdf/reader/page.rb index d48768ad..23a16553 100644 --- a/lib/pdf/reader/page.rb +++ b/lib/pdf/reader/page.rb @@ -231,6 +231,45 @@ def rectangles } end + # returns the text of each paragraph on the page as an array of Strings. Paragraph objects are built internally but only their text is returned. Recognised opts (also passed on to #runs): :minimum_horizontal_overlap_percentage (0.80), :maximum_multiplied_leading (1.40), :maximum_allowed_font_difference (1.00). + def paragraphs(opts = {}) + minimum_horizontal_overlap_percentage = opts.fetch(:minimum_horizontal_overlap_percentage, 0.80) + maximum_multiplied_leading = opts.fetch(:maximum_multiplied_leading, 1.40) + maximum_allowed_font_difference = opts.fetch(:maximum_allowed_font_difference, 1.00) + + disjoint_set = PDF::Reader::DisjointSet.new + runs(opts).each { |run| disjoint_set.add(run) } + + # Compare every pair of runs and union the pairs whose horizontal overlap, leading and font size difference are within the + # thresholds above, so runs of one paragraph end up in one set. NOTE(review): horizontal_overlap returns a width, not a percentage; confirm the 0.80 default. 
+ disjoint_set.each do |l0| + disjoint_set.each do |l1| + next if l0 == l1 + next if disjoint_set.find(l0) == disjoint_set.find(l1) + + overlap_percentage = l0.horizontal_overlap(l1) + leading = (l0.y - l1.y).abs / [l0.font_size, l1.font_size].min + + next unless overlap_percentage >= minimum_horizontal_overlap_percentage + next unless leading <= maximum_multiplied_leading + next if (l0.font_size - l1.font_size).abs > maximum_allowed_font_difference + + disjoint_set.union(l0, l1) + end + end + + paragraphs = disjoint_set.sets.map do |set| + # remember, pdf page origin is bottom left corner, so the topmost run is the one with the largest y + leftmost_x = set.map(&:x).min + topmost_y = set.map(&:y).max + text = set.map { |run| run.text.strip }.join(' ') + + PDF::Reader::Paragraph.new(text, PDF::Reader::Point.new(leftmost_x, topmost_y)) + end + + paragraphs.map(&:text) + end + private def root diff --git a/lib/pdf/reader/paragraph.rb b/lib/pdf/reader/paragraph.rb new file mode 100644 index 00000000..c27aa250 --- /dev/null +++ b/lib/pdf/reader/paragraph.rb @@ -0,0 +1,18 @@ +# coding: utf-8 +# typed: true +# frozen_string_literal: true + +module PDF + class Reader + + # A simple class used by PDF::Reader::Page#paragraphs to represent a paragraph of text and its origin (a PDF::Reader::Point at the leftmost x and topmost y of the paragraph's runs). 
+ class Paragraph + attr_reader :text, :origin + + def initialize(text, origin) + @text = text + @origin = origin + end + end + end +end diff --git a/lib/pdf/reader/text_run.rb b/lib/pdf/reader/text_run.rb index 9daa0b44..12bfa814 100644 --- a/lib/pdf/reader/text_run.rb +++ b/lib/pdf/reader/text_run.rb @@ -91,6 +91,17 @@ def intersection_area_percent(other_run) intersection_area.to_f / area end + # return the length of the horizontal span shared by this text run and other_run, or 0 if they do not overlap. NOTE: the result is in the same units as x and endx; it is not a percentage + def horizontal_overlap(other_run) + # rectangles do not overlap (we are on the left side) + return 0 if [x, endx].max < [other_run.x, other_run.endx].min + # rectangles do not overlap (other_run is on the left side) + return 0 if [other_run.x, other_run.endx].max < [x, endx].min + a = [ [x, endx].min, [other_run.x, other_run.endx].min ].max + b = [ [x, endx].max, [other_run.x, other_run.endx].max ].min + return (a - b).abs + end + private def area diff --git a/rbi/pdf-reader.rbi b/rbi/pdf-reader.rbi index 07233fc1..b0fa7356 100644 --- a/rbi/pdf-reader.rbi +++ b/rbi/pdf-reader.rbi @@ -205,6 +205,38 @@ module PDF def bfrange_type_two(start_code, end_code, dst); end end + class DisjointSet + include Enumerable + Elem = type_member { {fixed: T.untyped} } + + sig { void } + def initialize + @parents = T.let({}, T::Hash[T.anything, T.untyped]) + @ranks = T.let({}, T::Hash[T.anything, T.untyped]) + end + + sig { params(item: T.anything).returns(T::Boolean) } + def contains(item); end + + sig { override.params(block: T.nilable).returns(T.any(T::Hash[T.untyped, T.untyped], T::Enumerator[T.untyped])) } + def each(&block); end + + sig { returns(Integer) } + def length; end + + sig { params(x: T.untyped).returns(PDF::Reader::DisjointSet) } + def add(x); end + + sig { type_parameters(:U).params(x: T.type_parameter(:U)).returns(T.type_parameter(:U)) } + def find(x); end + + sig { returns(T::Array[T.untyped]) } + def sets; end + + sig { params(x: T.untyped, y: 
T.untyped).returns(PDF::Reader::DisjointSet) } + def union(x, y); end + end + class Encoding CONTROL_CHARS = T.let(T.unsafe(nil), T::Array[Integer]) UNKNOWN_CHAR = T.let(T.unsafe(nil), Integer) @@ -931,6 +963,9 @@ module PDF sig { returns(T::Hash[Symbol, PDF::Reader::Rectangle]) } def rectangles; end + sig { params(opts: T::Hash[Symbol, T.untyped]).returns(T::Array[String]) } + def paragraphs(opts = {}); end + sig { returns(T::Hash[Symbol, T.untyped]) } def root; end @@ -1198,6 +1233,17 @@ module PDF OPERATORS = T.let(T.unsafe(nil), T::Hash[String, Symbol]) end + class Paragraph + sig { returns(String) } + attr_reader :text + + sig { returns(PDF::Reader::Point) } + attr_reader :origin + + sig { params(text: String, origin: PDF::Reader::Point).void } + def initialize(text, origin); end + end + class Parser sig { params(buffer: PDF::Reader::Buffer, objects: T.nilable(PDF::Reader::ObjectHash)).void } def initialize(buffer, objects=nil); end @@ -1577,6 +1623,9 @@ module PDF sig { params(other_run: T.untyped).returns(Numeric) } def intersection_area_percent(other_run); end + sig { params(other_run: T.untyped).returns(Numeric) } + def horizontal_overlap(other_run); end + sig { returns(Numeric) } def area; end diff --git a/spec/disjoint_set_spec.rb b/spec/disjoint_set_spec.rb new file mode 100644 index 00000000..f2a320a0 --- /dev/null +++ b/spec/disjoint_set_spec.rb @@ -0,0 +1,97 @@ +# typed: false +# coding: utf-8 + +describe PDF::Reader::DisjointSet do + let(:set) { PDF::Reader::DisjointSet.new } + + describe "#add" do + it "adds a new item to the set" do + set.add(5) + expect(set.length).to eq(1) + expect(set.contains(5)).to be_truthy + end + end + + describe "#each" do + let(:set) do + set = PDF::Reader::DisjointSet.new + set.add(1) + set.add(2) + set.add(3) + set.union(1, 2) + end + + it "iterates over each item in the set (even if unions are created)" do + expect(set.each.to_a).to eq([1, 2, 3]) + end + + it "is used by Enumerable to provide iterative functionality 
like #map" do + result = set.map { |x| x.to_s } + expect(result).to eq(['1', '2', '3']) + end + end + + describe "#find" do + it "finds the parent of the item" do + set.add("parent") + set.add("child") + set.union("parent", "child") + expect(set.find("parent")).to eq("parent") + expect(set.find("child")).to eq("parent") + end + + it "returns the item if it is a parent" do + set.add("item") + expect(set.find("item")).to eq("item") + end + end + + describe "#sets" do + it "returns an array of arrays containing the sets" do + set.add("parent") + set.add("child") + set.add("unrelated") + set.union("parent", "child") + expect(set.sets).to eq([["parent", "child"], ["unrelated"]]) + end + end + + describe "#union" do + let(:set) do + set = PDF::Reader::DisjointSet.new + set.add("parent") + set.add("child") + set.add("grandchild") + set.add("unrelated") + end + + it "handles multiple unions" do + set.union("parent", "child") + set.union("child", "grandchild") + expect(set.sets).to eq([["parent", "child", "grandchild"], ["unrelated"]]) + end + + it "handles union params regardless of order" do + set.union("child", "parent") + set.union("grandchild", "child") + expect(set.sets).to eq([["parent", "child", "grandchild"], ["unrelated"]]) + end + + it "gracefully handles union of identical elements" do + set.union("child", "child") + expect(set.sets).to eq([["parent"], ["child"], ["grandchild"], ["unrelated"]]) + end + + it "handles joining multiple previous unions" do + set = PDF::Reader::DisjointSet.new + set.add("parent1") + set.add("child1") + set.add("parent2") + set.add("child2") + set.union("parent1", "child1") + set.union("parent2", "child2") + set.union("parent1", "parent2") + expect(set.sets).to eq([["parent1", "child1", "parent2", "child2"]]) + end + end +end diff --git a/spec/page_spec.rb b/spec/page_spec.rb index 1119ddf7..50d60d6c 100644 --- a/spec/page_spec.rb +++ b/spec/page_spec.rb @@ -110,6 +110,42 @@ end end + describe "#paragraphs page 1" do + let!(:page) { 
browser.page(1) } + + context "of cairo-basic.pdf" do + let!(:browser) { PDF::Reader.new(pdf_spec_file("cairo-basic")) } + + it "returns the text content" do + expect(page.paragraphs).to eql(["Hello James"]) + end + end + + context "of all_page_boxes_exist.pdf" do + let!(:browser) { PDF::Reader.new(pdf_spec_file("all_page_boxes_exist")) } + + it "returns headlines as their own paragraph" do + expect(page.paragraphs).to include("PDF Automation") + end + + it "returns actual paragraphs" do + expect(page.paragraphs).to include(<<~TEXT.strip.gsub(/\n/, " ")) + PDF page boxes include Media Box, Trim Box and Bleed Box. Imposition + in the Sheridan work flow requires a Trim Box and a Bleed Box where + bleeds are present with a consistent Media Box. + TEXT + end + + it "returns paragraphs from multi-column layouts" do + expect(page.paragraphs).to include(<<~TEXT.strip.gsub(/\n/, " ")) + QuarkXPress Enter your trim size of the Width and the Height. Elements that bleed + must extend .125" (1/8") beyond the project’s trim edge in your project + layout. + TEXT + end + end + end + describe "#walk" do context "with page 1 of cairo-basic.pdf" do