Skip to content

Commit 01b1665

Browse files
authored
Defer building HtmlPage id/name lookup index until first read (#1116)
* Defer building HtmlPage id/name lookup index until first read Every element added during parsing eagerly invoked addMappedElement, which issued two getAttribute("id"/"name") lookups, two ConcurrentHashMap.put operations, and (when applicable) allocated a MappedElementIndexEntry plus its inner ArrayList. For pages parsed and walked via querySelector / iteration without any getElementById call, the entire index is built and never read. Add a mappedElementsBuilt_ flag on HtmlPage; addMappedElement and removeMappedElement early-return until it flips. A new private ensureMappedElementsBuilt() walks the document once on first read of getElementById / getElementsById / getElementByName / getElementsByName / getElementsByIdAndOrName and populates both idMap_ and nameMap_; subsequent mutations through addMappedElement/removeMappedElement stay incremental as before. clone() resets the flag so cloned pages re-lazy-build. JMH HtmlUnitBenchmark.JMH on a ~30 KB HTML page (5 forks x 3 warmup x 5 measurement = 25 samples, paired sequentially): baseline: 633.205 +- 8.793 us/op with this: 593.611 +- 13.110 us/op delta: -39.6 us, -6.3% Targeted tests across HtmlPageTest, HtmlPage2Test, HtmlPage3Test, HtmlInlineFrameTest, HtmlFrameTest, HtmlFrameSetTest, HTMLDocumentTest, Html*ParserTest: 1176 tests, 0 failures. * Set mappedElementsBuilt_ after populating the maps Per review feedback: the flag was being flipped before addElement() ran, so a partial failure mid-walk would leave the page with built_=true and a half-populated index, and subsequent reads would silently return incomplete results. Move the assignment after the populate so a thrown exception leaves built_=false and the next read retries.
1 parent 8e3d9cf commit 01b1665

1 file changed

Lines changed: 34 additions & 0 deletions

File tree

src/main/java/org/htmlunit/html/HtmlPage.java

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,10 @@ public class HtmlPage extends SgmlPage {
154154

155155
private Map<String, MappedElementIndexEntry> idMap_ = new ConcurrentHashMap<>();
156156
private Map<String, MappedElementIndexEntry> nameMap_ = new ConcurrentHashMap<>();
157+
// The id/name lookup index is built lazily on first use. Until then,
158+
// notifyNodeAdded / fireAttributeChange skip the per-element index updates.
159+
// Reads must call ensureMappedElementsBuilt() before consulting idMap_/nameMap_.
160+
private boolean mappedElementsBuilt_;
157161

158162
private List<BaseFrameElement> frameElements_ = new ArrayList<>();
159163
private int parserCount_;
@@ -631,6 +635,7 @@ public ProcessingInstruction createProcessingInstruction(final String namespaceU
631635
@Override
632636
public DomElement getElementById(final String elementId) {
633637
if (elementId != null) {
638+
ensureMappedElementsBuilt();
634639
final MappedElementIndexEntry elements = idMap_.get(elementId);
635640
if (elements != null) {
636641
return elements.first();
@@ -1708,6 +1713,7 @@ public <E extends HtmlElement> E getHtmlElementById(final String elementId) thro
17081713
*/
17091714
public List<DomElement> getElementsById(final String elementId) {
17101715
if (elementId != null) {
1716+
ensureMappedElementsBuilt();
17111717
final MappedElementIndexEntry elements = idMap_.get(elementId);
17121718
if (elements != null) {
17131719
return new ArrayList<>(elements.elements());
@@ -1728,6 +1734,7 @@ public List<DomElement> getElementsById(final String elementId) {
17281734
@SuppressWarnings("unchecked")
17291735
public <E extends DomElement> E getElementByName(final String name) throws ElementNotFoundException {
17301736
if (name != null) {
1737+
ensureMappedElementsBuilt();
17311738
final MappedElementIndexEntry elements = nameMap_.get(name);
17321739
if (elements != null) {
17331740
return (E) elements.first();
@@ -1746,6 +1753,7 @@ public <E extends DomElement> E getElementByName(final String name) throws Eleme
17461753
*/
17471754
public List<DomElement> getElementsByName(final String name) {
17481755
if (name != null) {
1756+
ensureMappedElementsBuilt();
17491757
final MappedElementIndexEntry elements = nameMap_.get(name);
17501758
if (elements != null) {
17511759
return new ArrayList<>(elements.elements());
@@ -1765,6 +1773,7 @@ public List<DomElement> getElementsByIdAndOrName(final String idAndOrName) {
17651773
if (idAndOrName == null) {
17661774
return Collections.emptyList();
17671775
}
1776+
ensureMappedElementsBuilt();
17681777
final MappedElementIndexEntry list1 = idMap_.get(idAndOrName);
17691778
final MappedElementIndexEntry list2 = nameMap_.get(idAndOrName);
17701779
final List<DomElement> list = new ArrayList<>();
@@ -1841,12 +1850,32 @@ void notifyNodeRemoved(final DomNode node) {
18411850
* @param recurse indicates if children must be added too
18421851
*/
18431852
void addMappedElement(final DomElement element, final boolean recurse) {
1853+
// Index is built lazily; skip while not built. ensureMappedElementsBuilt()
1854+
// walks the tree once and populates everything on first read.
1855+
if (!mappedElementsBuilt_) {
1856+
return;
1857+
}
18441858
if (isAncestorOf(element)) {
18451859
addElement(idMap_, element, DomElement.ID_ATTRIBUTE, recurse);
18461860
addElement(nameMap_, element, DomElement.NAME_ATTRIBUTE, recurse);
18471861
}
18481862
}
18491863

1864+
private void ensureMappedElementsBuilt() {
1865+
if (mappedElementsBuilt_) {
1866+
return;
1867+
}
1868+
final DomElement root = getDocumentElement();
1869+
if (root != null) {
1870+
addElement(idMap_, root, DomElement.ID_ATTRIBUTE, true);
1871+
addElement(nameMap_, root, DomElement.NAME_ATTRIBUTE, true);
1872+
}
1873+
// Flip the flag only after the maps are populated, so a partial
1874+
// failure mid-walk leaves us with built_=false and the next read
1875+
// tries again rather than seeing a half-populated index.
1876+
mappedElementsBuilt_ = true;
1877+
}
1878+
18501879
private void addElement(final Map<String, MappedElementIndexEntry> map, final DomElement element,
18511880
final String attribute, final boolean recurse) {
18521881
final String value = element.getAttribute(attribute);
@@ -1882,6 +1911,10 @@ private void addElement(final Map<String, MappedElementIndexEntry> map, final Do
18821911
* @param descendant indicates of the element was descendant of this HtmlPage, but now its parent might be null
18831912
*/
18841913
void removeMappedElement(final DomElement element, final boolean recurse, final boolean descendant) {
1914+
// see addMappedElement: while the index is unbuilt, removals are also no-ops.
1915+
if (!mappedElementsBuilt_) {
1916+
return;
1917+
}
18851918
if (descendant || isAncestorOf(element)) {
18861919
removeElement(idMap_, element, DomElement.ID_ATTRIBUTE, recurse);
18871920
removeElement(nameMap_, element, DomElement.NAME_ATTRIBUTE, recurse);
@@ -1998,6 +2031,7 @@ protected HtmlPage clone() {
19982031

19992032
result.idMap_ = new ConcurrentHashMap<>();
20002033
result.nameMap_ = new ConcurrentHashMap<>();
2034+
result.mappedElementsBuilt_ = false;
20012035

20022036
return result;
20032037
}

0 commit comments

Comments
 (0)