From 46beee1788214e0b1511da40d58a879d333ab50c Mon Sep 17 00:00:00 2001 From: Yousuf Jukaku Date: Wed, 16 Oct 2019 15:31:21 -0400 Subject: [PATCH] Allow access to other XML docs in docx file like the headers and footers --- lib/docx/document.rb | 59 +++++++++++++++++++++++++++++++---- spec/docx/document_spec.rb | 16 ++++++++++ spec/fixtures/multi_doc.docx | Bin 0 -> 6282 bytes 3 files changed, 69 insertions(+), 6 deletions(-) create mode 100644 spec/fixtures/multi_doc.docx diff --git a/lib/docx/document.rb b/lib/docx/document.rb index 3102da4..89eb764 100755 --- a/lib/docx/document.rb +++ b/lib/docx/document.rb @@ -18,15 +18,24 @@ module Docx # puts d.text # end class Document - attr_reader :xml, :doc, :zip, :styles + + # A path with * indicates that there are possibly multiple documents + # matching that glob, eg. word/header1.xml, word/header2.xml + DOCUMENT_PATHS = { + doc: "word/document.xml", + styles: "word/styles.xml", + headers: "word/header*.xml", + footers: "word/footer*.xml", + numbering: "word/numbering.xml" + } + + attr_reader :xml, :doc, :zip, :styles, :headers, :footers, :numbering def initialize(path, &block) @replace = {} @zip = Zip::File.open(path) - @document_xml = @zip.read('word/document.xml') - @doc = Nokogiri::XML(@document_xml) - @styles_xml = @zip.read('word/styles.xml') - @styles = Nokogiri::XML(@styles_xml) + extract_documents + if block_given? yield self @zip.close @@ -57,6 +66,8 @@ def paragraphs def bookmarks bkmrks_hsh = Hash.new bkmrks_ary = @doc.xpath('//w:bookmarkStart').map { |b_node| parse_bookmark_from b_node } + bkmrks_ary += @headers.values.map { |xml_doc| xml_doc.xpath('//w:bookmarkStart').map { |b_node| parse_bookmark_from b_node } }.flatten + bkmrks_ary += @footers.values.map { |xml_doc| xml_doc.xpath('//w:bookmarkStart').map { |b_node| parse_bookmark_from b_node } }.flatten # auto-generated by office 2010 bkmrks_ary.reject! {|b| b.name == "_GoBack" } bkmrks_ary.each {|b| bkmrks_hsh[b.name] = b } @@ -123,13 +134,49 @@ def replace_entry(entry_path, file_contents) private + def extract_documents + DOCUMENT_PATHS.each do |attr_name, path| + if path.match /\*/ + extract_multiple_documents_from_globbed_path(attr_name, path) + else + extract_single_document_from_path(attr_name, path) + end + end + end + + def extract_single_document_from_path(attr_name, path) + if @zip.find_entry(path) + xml_doc = @zip.read(path) + self.instance_variable_set(:"@#{attr_name}", Nokogiri::XML(xml_doc)) + end + end + + def extract_multiple_documents_from_globbed_path(hash_attr_name, glob_path) + files = @zip.glob(glob_path).map { |h| h.name } + filename_and_contents_pairs = files.map do |file| + simple_file_name = file.sub(/^word\//, "").sub(/\.xml$/, "") + [simple_file_name, Nokogiri::XML(@zip.read(file))] + end + hash = Hash[filename_and_contents_pairs] + self.instance_variable_set(:"@#{hash_attr_name}", hash) + end + #-- # TODO: Flesh this out to be compatible with other files # TODO: Method to set flag on files that have been edited, probably by inserting something at the # end of methods that make edits? #++ def update - replace_entry "word/document.xml", doc.serialize(:save_with => 0) + DOCUMENT_PATHS.each do |attr_name, path| + if path.match /\*/ + self.instance_variable_get("@#{attr_name}").each do |simple_file_name, contents| + replace_entry("word/#{simple_file_name}.xml", contents.serialize(:save_with => 0)) + end + else + xml_document = self.instance_variable_get("@#{attr_name}") + replace_entry path, xml_document.serialize(:save_with => 0) if xml_document + end + end end # generate Elements::Containers::Paragraph from paragraph XML node diff --git a/spec/docx/document_spec.rb b/spec/docx/document_spec.rb index a1f76dc..f9f6e6f 100755 --- a/spec/docx/document_spec.rb +++ b/spec/docx/document_spec.rb @@ -279,6 +279,22 @@ end end + describe 'multiple documents' do + before do + @doc = Docx::Document.open(@fixtures_path + '/multi_doc.docx') + end + + it 'should extract all inner documents' do + expect(@doc.doc).to_not be_nil + expect(@doc.styles).to_not be_nil + expect(@doc.headers).to_not be_nil + expect(@doc.headers["header1"].text).to eq "Hello from the header." + expect(@doc.footers).to_not be_nil + expect(@doc.footers["footer1"].text).to eq "Hello from the footer." + expect(@doc.numbering).to_not be_nil + end + end + describe 'saving' do before do @doc = Docx::Document.open(@fixtures_path + '/saving.docx') diff --git a/spec/fixtures/multi_doc.docx b/spec/fixtures/multi_doc.docx new file mode 100644 index 0000000000000000000000000000000000000000..008d06eadeff3b85691cf1d98670c7273e4fb0e0 GIT binary patch literal 6282 zcmbVQ1yodBy9NfNduXLW5G16fq#I$79J*uZZb@lDx}>{PK*=GcL6DG^5D*FJx`Y4s zDZ1|OUF*KHX4dR8=Q(@!+wY?!2S5bEp`f6^K~t5U!rc;d*mpe#6Nn=#%gtxmD+M`# z2uAR?O9HW-IS5lo@6Zvos4ev&a$xfI&d3llheg@-ya-y8QdnlsJNJX#yJOZvSDA`w z;o&0CVha&_w$UawOSf4;Rto)b)+TkK=nt~V+7F871G5eAWL}1J$!`~NvfCG?Qkgyl z^#~YJ*A&Fq&($9%`98|bV4s+@>;=C?s5_Uw;NkWltfmd5H`UG8uBhQAz~^F6<6hTL)v-|ER^-*2vk~#Kwum%^LC_6~*>j zLfLR6j!)6p&$V1;fH0+iBP(UxydnZ#22j^@j2erVyR>x~%i z9P9utgfM30V99Ayw)E%2XD8q9oLj2+EHW_aqFyc{gxkvFjNmuA4h3u~_eX2KpnqLE zX(gk47R6uGV*J7@b)A1Fkz8rcy;_B)Eh7a%!`u0U!gjVkpd&z)C^DFw(k*ivIe(*1J(%Lo@NCJ(yyW0Dn^*?!T&YG;wmW zurYJIQJO@Q999P#hWK$B^*%XX^oeTjb_1N<^AXa*xG7}_gPli5*y_e$#JtV5?5IFh zZ`)@BAxHh_$2-VP?z|aT$V4V)@Kh+IQ%0dWTs)Jqu025G zZjz#ErbJ+Ew?hq0q7c!JQm&#(E&93|caUFZSt7-naW}oum+0C}=6h2V8+)YW%1@UZ zo(YkPKH=7C-^K-=~SVMvt%}rQ4hCixeTlq9o@EXm4S+=L~OkI~S;Q^#BK1#8K=;uK71`*_1H=EkBuD5r_V&D1f zg14#puaI6vVW)V(-&i8%(@>@?JRICA;@>Rs)7x%`=!Dv=P3jXI?{Dgv-cR9rB0<2k z#6neaQi;fOcy#8<&3R7#Df`xznmZA((gcxt4Dk;uT!#meIiPTk-Uqp=wYZCUTzqoF z_=Kn`GqNf4X2Ku!hRZcR$E1caS&0w1bkw9y6l__mO;?Y1=!;`?n{k`nOBzTl8~K!B zq3M5E-(54dIWSqr+*MiG4se*HZeor?E^yomq$;h?e`4XV?`^+ZZ9j5S9D! z&Dd#-^}(AHvNFlHU(*V~=u|KmN7WJ~&_y1P4swfW`w16zMx!4$wuckq`Ug)Kx6F53apcm2Z9ECp3 z9$U32)}7q8_R8{gN}l8DMqmZbp@;*KoFNj&q6Q^88gp0c@JIbLJ8>cP)4?Tab1zB> z!T|{Fl&>e-;dKpWG1Rxz&O%xQ9z4!N40`|?Ga`(rga@ z5L|Q8yOa^g!de8#npv zI#dHfmsaZR(&?g`J>fc^ z_IE6J@;tJ=KQlQdUm)Bl(@{$4H?ruB_B}J8HO=!PkJWg(X zj*%~@Man zyZ60~$>}XC0?$|1gtk2ui#=b)Z>93fdXO<2w`)cO11JvJ~W#TKD zsnV8+=Zr#aOe@OYsHlx05k4+2c%$#o6getebok-GxT%9P2GG(if9}9yp8Y*X!voAx zUAM@X05StDBee+cnS#zo z^EZu<=tIXER*NO=@3d|9w! zF>)d}QEhNVVb;h`pAf237cRxC=d2J;+s;m%lc*WsQ>jK^8P5(GP4sM9?96R3sWy=| z>DOWNmJdG2rB#?aZ!YeAde+P}df2%3Wcazyz?WSw#p{a`)V41pFCn?4$Bd=qG4}fP z)?S??Y*2VO=z^~Zn*gaMN%<$DKD!NBJz~k0_@(#W?CA!dWzA56@ySZMy<@IttC-TM zh{4Dy_Ik8XeD%j-IGP2i_ImPW4Gwu6fqu@}B*zk0wlwOG155|r?|$wKJ)$5!(&CBm z)fM)St)L)LL{Et}TX?^<)E0A&JW$sb#`U}n&5`wa%!XPm6^a&4Bo<9#nYk3DxybiN z5fOFS_rx-TMyuFfjfJ@&_!Ek&boBBw%`oBAQl4Q+x@e9{;+Pva5k(+YXL|!+KLgSk zr8;6)QC2WMx@fMQwinl?X$i@JiH4#`%%Qe^!cTR0H9XXd7WKdAthWT@x!zmGN|<-1 z=GXA}xVH(rXqI<#(MvJz$|;PUqj_qN=iMZOmBGIJ=h)iC)!MdKge_@=8@)`k2Z}C&M%{nW;jGj7ul#*lVSe|9!>#jYoFp z<9tVYk?=V3puE&=n>o55vE{BA`slGc&W7%Ez#1l1M?~}>u)Pv0P-3tPxRKiyr#6&K zi*a8WPSQMs368+=3cuej4&uinYfdyTHEI+h`e?murkMLk-0FIB;emVcR)W)4ILCmt zpgJF9*(h&zbk~_9F;T9q`?@ zEkOEPl<5^o(vlmLYI{x@ZH(?=EaMzJuOH_&#}rYZV;Z7c{1#qfGYX?05GAZZHrj9z z(L0n3U8>XWOg+gPaAO?uA#_STSYhq@Ya92|IOZk>#(yymLEQhxIG65+;-Sv+>+=4D zjDl!#JkxVca>aK;iL6qe$dY{sU5!^7D1SDH%M7&X5}{K9auKOy#v-U_@tM71OB07D z&hjpYdp2*YS@K_yS026^|^6VZ1 z-;rwRq?!EPhr|u*X#}P+>vuJeb5?C>X4(1hzY^?2Xm+{tzhVN1!*kH*TMQ)!YWKNk z&_#_DIlB_&ONf2A^B0o#csvSCzD7Y8Yl=>mxYZN+&nYUdeSuGPz?4CE@)qUX$w zQp~x8>tXLKHu9`jk!|()NN)o`^{+RNJ&HT|qnwwQ`K?yrs^%kKmuc4O_U=0_gUaVWF66pWZH#|yNRDj-+T~u zt}9i$Vo&HuG3VCD6%A$tNr`oXdiT$A^CSxQ;;Jz=`!&rTIfvR$=)9Y& znbB?Rd1d0jhd@~SdCORp?xXNaGAEDw?Mp6s&mDufbj?d0)L&vNR3j>+dwmM0vJ(+w zCD``75In-`IcGZin1b7vhxKrA?0qX!-u*fmfj&QX!=icG7UoR*hKd^17+ISXyg=;i zT?w(slo#`QZ3=ySb%Tz8c^@s6S^gxz7MfMDjlV6k&9Y)2sRK^xSMDptDzdUjQ-mEd z>8h;hbn}CT%wg`u<6R$`nhC{YVNh{?9b$=`)~T#P$Gb=nT0qQWS7)6pH>~f_Z%-=1 z=RNf*+4#%l2>3bz$!fL3+s(eatyV6vz|j!RG0Aj|kmaSJo|QNWQ(Rf9hm<%yw~Vme zowHpz&rz6rWJHGPURjo?KsG^tRIo*Cu!=FlK(~0FWL;Iu7(cdqy+lZ?gH2uo9U1C4 z#~k$HCQA-L-j(W03W*TnDb&c3qFI*iXtY37X|R@t_u2@WqKfY-Dtb+os-&#oO=p1d zeHhv7YUbnsk9INRe%KoaPVna=uGNbRtBQwoSnKyfldIzDgObs%gp*Ni*Y}|oCHocr^+~Ij42oeJ%HK z2)hiDsCdCRcTYs;{H`J!9#3D3HX46CPaZzYzWZ{Kl%5NOj+oP0m5>m1$Oa#Z=IWz< zB2#3q|DlPM%2q7L;9%A-K?Lu!$i<1`w5cmEdDwPR-aNCt)5{YCQ{_ouTcWaTf4%uw z+EJYowGkY&IiBir)#1t;c;Ni28E?9b!Ow?J;bSy5+T0=7T7iR^J z(_WuXKzx);YLT~$^UOkM8N;3&$V%%BjP!fpE@bz*=OZyOVRASjzF0d|=*C=)4>ALj zROq{gvSQbW ztq(KQz7|$}7%b+^6yWC)ZA4lV5pRF)vg)zse6ic0o9Pwz>TOakdvbYSAfD*`FP_1^ z&9)eC8>-!*uSG(Tcy03bmV)n{(%{f zc4>^&Q_}Dgs+ZuIR}UK#%G~;hghK&&u`@_C)YedcPb$4Ggp)%%H)viBE@Z%H zUw>sk)J}w{H=nvw${^G5g6>K9J87o=9asO((xly%36S{v9(6VE>{U+?$i}L2l)R$w z`QA4fTb4pWMZQK(#0P~J3nM7(AV+o)-yTQjGCHTM*i>vq1nSdKOst`1Tuc9<&UZlv zW*eoRVqTCdcMhlflIF9iP?o{F2#muNxivz|654@mO`V>!E4n@aQJuh6jvb1y28$&; znySM~)Tc8RcI139R|7A&7zql~MBPx7Df$vv(%8aZT;DHh#Y3iB@R?b0y@?bs8`hLG z%n&O`H)#qU0SNbteZ9>>-mtH~%PlVUPvC71?uMlNC3vu{{zh5;`J~%S*G;(o5(Sv& z{CLuB2JcV!ZF=Tr!2Kocu)gvK{5Nv!PyFrC`Se5 z-Jjsw`S