From 07e36bfa9d4dd9772e7897ae36aea7ad509e3533 Mon Sep 17 00:00:00 2001 From: Jason Woods Date: Mon, 24 Jul 2023 20:33:49 +1000 Subject: [PATCH] Added basic parsing of document headers and footers --- README.md | 3 ++ bin/mammoth | 4 +++ lib/document-to-html.js | 29 ++++++++++++++++++-- lib/documents.js | 26 ++++++++++++++++-- lib/docx/document-xml-reader.js | 4 ++- lib/docx/docx-reader.js | 44 +++++++++++++++++++++++------- lib/docx/header-footer-reader.js | 21 ++++++++++++++ lib/index.d.ts | 1 + lib/main.js | 4 ++- lib/options-reader.js | 5 +++- test/document-to-html.tests.js | 24 ++++++++++++++++ test/mammoth.tests.js | 8 ++++++ test/test-data/header-footer.docx | Bin 0 -> 15722 bytes 13 files changed, 154 insertions(+), 19 deletions(-) create mode 100644 lib/docx/header-footer-reader.js create mode 100644 test/test-data/header-footer.docx diff --git a/README.md b/README.md index b16733698..b4e019201 100644 --- a/README.md +++ b/README.md @@ -374,6 +374,9 @@ Converts the source document to HTML. * `ignoreEmptyParagraphs`: by default, empty paragraphs are ignored. Set this option to `false` to preserve empty paragraphs in the output. + * `includeHeadersAndFooters`: by default, headers and footers are not included in the output. + Set this option to `true` to include them at the start and end of the output. + * `idPrefix`: a string to prepend to any generated IDs, such as those used by bookmarks, footnotes and endnotes. diff --git a/bin/mammoth b/bin/mammoth index a908d0743..1ed02c0b0 100755 --- a/bin/mammoth +++ b/bin/mammoth @@ -34,5 +34,9 @@ parser.addArgument(["--style-map"], { help: "File containg a style map." }); +parser.addArgument(["--include-headers-footers"], { + type: "string", + help: "Include headers and footers from the document." 
+}); main(parser.parseArgs()); diff --git a/lib/document-to-html.js b/lib/document-to-html.js index 32e8b0300..df5189cb8 100644 --- a/lib/document-to-html.js +++ b/lib/document-to-html.js @@ -34,6 +34,7 @@ function DocumentConversion(options, comments) { options = _.extend({ignoreEmptyParagraphs: true}, options); var idPrefix = options.idPrefix === undefined ? "" : options.idPrefix; var ignoreEmptyParagraphs = options.ignoreEmptyParagraphs; + var includeHeadersAndFooters = options.includeHeadersAndFooters; var defaultParagraphStyle = htmlPaths.topLevelElement("p"); @@ -345,6 +346,24 @@ function DocumentConversion(options, comments) { } } + function convertHeader(headers, messages, options) { + if (!includeHeadersAndFooters) { + return []; + } + + var children = convertElements(headers.children, messages, options); + return Html.freshElement("header", {}, children); + } + + function convertFooter(footers, messages, options) { + if (!includeHeadersAndFooters) { + return []; + } + + var children = convertElements(footers.children, messages, options); + return Html.freshElement("footer", {}, children); + } + var elementConverters = { "document": function(document, messages, options) { var children = convertElements(document.children, messages, options); @@ -352,12 +371,14 @@ function DocumentConversion(options, comments) { return document.notes.resolve(noteReference); }); var notesNodes = convertElements(notes, messages, options); - return children.concat([ + var headers = convertElements(document.headers, messages, options); + var footers = convertElements(document.footers, messages, options); + return headers.concat(children).concat([ Html.freshElement("ol", {}, notesNodes), Html.freshElement("dl", {}, flatMap(referencedComments, function(referencedComment) { return convertComment(referencedComment, messages, options); })) - ]); + ]).concat(footers); }, "paragraph": convertParagraph, "run": convertRun, @@ -408,7 +429,9 @@ function DocumentConversion(options, 
comments) { "table": convertTable, "tableRow": convertTableRow, "tableCell": convertTableCell, - "break": convertBreak + "break": convertBreak, + "header": convertHeader, + "footer": convertFooter }; return { convertToHtml: convertToHtml diff --git a/lib/documents.js b/lib/documents.js index cc6a2a33c..07240a8f6 100644 --- a/lib/documents.js +++ b/lib/documents.js @@ -16,16 +16,20 @@ var types = exports.types = { tableRow: "tableRow", tableCell: "tableCell", "break": "break", - bookmarkStart: "bookmarkStart" + bookmarkStart: "bookmarkStart", + header: "header", + footer: "footer" }; function Document(children, options) { options = options || {}; return { type: types.document, - children: children, + children: children || [], notes: options.notes || new Notes({}), - comments: options.comments || [] + comments: options.comments || [], + headers: options.headers || [], + footers: options.footers || [] }; } @@ -221,6 +225,20 @@ function BookmarkStart(options) { }; } +function Header(children) { + return { + type: types.header, + children: children + }; +} + +function Footer(children) { + return { + type: types.footer, + children: children + }; +} + exports.document = exports.Document = Document; exports.paragraph = exports.Paragraph = Paragraph; exports.run = exports.Run = Run; @@ -240,5 +258,7 @@ exports.lineBreak = Break("line"); exports.pageBreak = Break("page"); exports.columnBreak = Break("column"); exports.BookmarkStart = BookmarkStart; +exports.header = exports.Header = Header; +exports.footer = exports.Footer = Footer; exports.verticalAlignment = verticalAlignment; diff --git a/lib/docx/document-xml-reader.js b/lib/docx/document-xml-reader.js index bb912e431..9426b5598 100644 --- a/lib/docx/document-xml-reader.js +++ b/lib/docx/document-xml-reader.js @@ -14,7 +14,9 @@ function DocumentXmlReader(options) { .map(function(children) { return new documents.Document(children, { notes: options.notes, - comments: options.comments + comments: options.comments, + 
headers: options.headers, + footers: options.footers }); }); return new Result(result.value, result.messages); diff --git a/lib/docx/docx-reader.js b/lib/docx/docx-reader.js index 5f0fd0547..2e6426fe4 100644 --- a/lib/docx/docx-reader.js +++ b/lib/docx/docx-reader.js @@ -17,6 +17,7 @@ var numberingXml = require("./numbering-xml"); var stylesReader = require("./styles-reader"); var notesReader = require("./notes-reader"); var commentsReader = require("./comments-reader"); +var extremityReader = require("./header-footer-reader"); var Files = require("./files").Files; @@ -58,6 +59,20 @@ function read(docxFile, input) { } else { return new Result([]); } + }), + headers: readXmlFileWithBody(result.partPaths.headers, result, function(bodyReader, xml) { + if (xml) { + return extremityReader.createHeaderReader(bodyReader)(xml); + } else { + return new Result([]); + } + }), + footers: readXmlFileWithBody(result.partPaths.footers, result, function(bodyReader, xml) { + if (xml) { + return extremityReader.createFooterReader(bodyReader)(xml); + } else { + return new Result([]); + } }) }; }).also(function(result) { @@ -72,12 +87,18 @@ function read(docxFile, input) { return readXmlFileWithBody(result.partPaths.mainDocument, result, function(bodyReader, xml) { return result.notes.flatMap(function(notes) { return result.comments.flatMap(function(comments) { - var reader = new DocumentXmlReader({ - bodyReader: bodyReader, - notes: notes, - comments: comments + return result.headers.flatMap(function(headers) { + return result.footers.flatMap(function(footers) { + var reader = new DocumentXmlReader({ + bodyReader: bodyReader, + notes: notes, + comments: comments, + headers: headers, + footers: footers + }); + return reader.convertXmlToDocument(xml); + }); }); - return reader.convertXmlToDocument(xml); }); }); }); @@ -103,13 +124,14 @@ function findPartPaths(docxFile) { readElement: relationshipsReader.readRelationships, defaultValue: relationshipsReader.defaultValue 
})(docxFile).then(function(documentRelationships) { - function findPartRelatedToMainDocument(name) { + function findPartRelatedToMainDocument(name, multiple) { return findPartPath({ docxFile: docxFile, relationships: documentRelationships, relationshipType: "http://schemas.openxmlformats.org/officeDocument/2006/relationships/" + name, basePath: zipfile.splitPath(mainDocumentPath).dirname, - fallbackPath: "word/" + name + ".xml" + fallbackPath: "word/" + name + ".xml", + multiple: multiple }); } @@ -119,13 +141,15 @@ function findPartPaths(docxFile) { endnotes: findPartRelatedToMainDocument("endnotes"), footnotes: findPartRelatedToMainDocument("footnotes"), numbering: findPartRelatedToMainDocument("numbering"), - styles: findPartRelatedToMainDocument("styles") + styles: findPartRelatedToMainDocument("styles"), + headers: findPartRelatedToMainDocument("header", true), + footers: findPartRelatedToMainDocument("footer", true) }; }); }); } -function findPartPath(options) { +function findPartPath(options, multiple) { var docxFile = options.docxFile; var relationships = options.relationships; var relationshipType = options.relationshipType; @@ -142,7 +166,7 @@ function findPartPath(options) { if (validTargets.length === 0) { return fallbackPath; } else { - return validTargets[0]; + return multiple === true ? validTargets : validTargets[0]; } } diff --git a/lib/docx/header-footer-reader.js b/lib/docx/header-footer-reader.js new file mode 100644 index 000000000..30bda7672 --- /dev/null +++ b/lib/docx/header-footer-reader.js @@ -0,0 +1,21 @@ +var documents = require("../documents"); +var Result = require("../results").Result; + +exports.createHeaderReader = createReader.bind(this, documents.header); +exports.createFooterReader = createReader.bind(this, documents.footer); + +function createReader(extremity, bodyReader) { + function readExtremityXml(element) { + var result = readElement(element, extremity); + return Array.isArray(result) ? 
Result.combine(result) : Result.combine([result]); + } + + function readElement(element, extremity) { + return bodyReader.readXmlElements(element.children) + .map(function(children) { + return extremity(children); + }); + } + + return readExtremityXml; +} diff --git a/lib/index.d.ts b/lib/index.d.ts index 465c8f95e..f04d17396 100644 --- a/lib/index.d.ts +++ b/lib/index.d.ts @@ -32,6 +32,7 @@ interface Options { includeDefaultStyleMap?: boolean; convertImage?: ImageConverter; ignoreEmptyParagraphs?: boolean; + includeHeadersAndFooters?: boolean; idPrefix?: string; transformDocument?: (element: any) => any; } diff --git a/lib/main.js b/lib/main.js index f6e149f08..dfdfb5ce8 100644 --- a/lib/main.js +++ b/lib/main.js @@ -13,11 +13,13 @@ function main(argv) { var outputDir = argv.output_dir; var outputFormat = argv.output_format; var styleMapPath = argv.style_map; + var includeHeadersAndFooters = argv.include_headers_footers !== null && argv.include_headers_footers.trim() === 'true'; readStyleMap(styleMapPath).then(function(styleMap) { var options = { styleMap: styleMap, - outputFormat: outputFormat + outputFormat: outputFormat, + includeHeadersAndFooters: includeHeadersAndFooters }; if (outputDir) { diff --git a/lib/options-reader.js b/lib/options-reader.js index 636f64877..a2f1a4587 100644 --- a/lib/options-reader.js +++ b/lib/options-reader.js @@ -51,7 +51,10 @@ var defaultStyleMap = exports._defaultStyleMap = [ "r[style-name='Hyperlink'] =>", - "p[style-name='Normal'] => p:fresh" + "p[style-name='Normal'] => p:fresh", + + "p.Header => p:fresh", + "p.Footer => p:fresh" ]; var standardOptions = exports._standardOptions = { diff --git a/test/document-to-html.tests.js b/test/document-to-html.tests.js index 4283e86e0..13b8f9774 100644 --- a/test/document-to-html.tests.js +++ b/test/document-to-html.tests.js @@ -822,3 +822,27 @@ test('when initials are blank then comment author label is blank', function() { assert.equal(commentAuthorLabel({authorInitials: undefined}), 
""); assert.equal(commentAuthorLabel({authorInitials: null}), ""); }); + +test('docx header is converted to <header>', function() { + var headers = [new documents.Header( + [paragraphOfText("This is a header")] + )]; + var document = new documents.Document(); + document.headers = headers; + var converter = new DocumentConverter({includeHeadersAndFooters: true}); + return converter.convertToHtml(document).then(function(result) { + assert.equal(result.value, '<header><p>This is a header</p></header>'); + }); +}); + +test('docx footer is converted to