1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
|
require 'roo/excelx/extractor'
module Roo
class Excelx
class SharedStrings < Excelx::Extractor
# Element names ("t", "r") and the opening/closing HTML wrapper tags, mirroring
# the string literals the extraction methods in this class compare against and emit.
# Frozen so this shared constant cannot be mutated by accident at runtime.
COMMON_STRINGS = {
  t: "t",
  r: "r",
  html_tag_open: "<html>",
  html_tag_closed: "</html>"
}.freeze
# Returns the plain-text shared string stored at +index+
# (whatever Array#[] yields for that argument on the result of #to_a).
def [](index)
  strings = to_a
  strings[index]
end
# Returns the list produced by extract_shared_strings.
# The list is built on first use and cached in @array for later calls.
def to_a
  return @array if @array

  @array = extract_shared_strings
end
# Returns the list produced by extract_html.
# The list is built on first use and cached in @html for later calls.
def to_html
  return @html if @html

  @html = extract_html
end
# Tells the caller whether to read a shared string through to_html or to_a:
# checks the HTML rendering at +index+ for an opening formatting tag.
# index ::: position of the shared string
#
# Returns the first matching tag ("<b>", "<i>", "<u>", "<sup>" or "<sub>"), which
# is truthy, or nil when there is no such tag or no string exists at +index+.
def use_html?(index)
  html = to_html[index]
  # An out-of-range index yields nil; answer "no formatting" instead of
  # raising NoMethodError on nil.
  html && html[/<([biu]|sup|sub)>/]
end
private
# Replaces every literal "_x000D_" token in the serialized document with a newline.
# doc ::: the shared strings document; only #to_s is called on it here
#
# Returns +doc+ itself when no token is present, otherwise a new document
# parsed by Nokogiri from the corrected text.
def fix_invalid_shared_strings(doc)
  replacements = { '_x000D_' => "\n" }
  pattern = /#{replacements.keys.join('|')}/
  source = doc.to_s

  if source =~ pattern
    ::Nokogiri::XML(source.gsub(pattern, replacements))
  else
    doc
  end
end
# Builds the plain-text value of every /sst/si entry in the shared strings document.
#
# For each entry: a direct <t> child replaces the text collected so far with its
# content, while an <r> child appends the content of each of its <t> children.
# Any other child element is ignored.
#
# Returns an Array of Strings, or [] when doc_exists? is false.
def extract_shared_strings
  return [] unless doc_exists?

  fix_invalid_shared_strings(doc).xpath('/sst/si').map do |string_item|
    text = ''
    string_item.children.each do |child|
      tag = child.name
      if tag == 't'
        text = child.content
      elsif tag == 'r'
        child.children.each do |run_child|
          next unless run_child.name == 't'

          text << run_child.content
        end
      end
    end
    text
  end
end
# Builds an HTML rendering of every /sst/si entry in the shared strings document.
#
# Each entry becomes "<html>...</html>": a direct <t> child contributes its raw
# content and an <r> child contributes whatever extract_html_r returns for it.
# Any other child element is ignored.
#
# Returns an Array of Strings, or [] when doc_exists? is false.
def extract_html
  return [] unless doc_exists?

  # fix_invalid_shared_strings returns a NEW document when it replaces anything,
  # so its return value must be queried; reading the raw doc would leave
  # "_x000D_" tokens in the output.
  document = fix_invalid_shared_strings(doc)
  document.xpath('/sst/si').map do |si|
    html_string = '<html>'
    si.children.each do |elem|
      case elem.name
      when 'r'
        html_string << extract_html_r(elem)
      when 't'
        html_string << elem.content
      end
    end
    html_string << '</html>'
  end
end
# Converts one rich-text run (<r>) from sharedStrings.xml into an HTML fragment.
# r_elem ::: XML element <r> found in sharedStrings.xml of excel_book.xlsx
# {code:xml}
# <r>
#   <rPr>
#     <i/>
#     <b/>
#     <u/>
#     <vertAlign val="subscript"/>   (or val="superscript")
#   </rPr>
#   <t>TEXT</t>
# </r>
# {code}
#
# Expected Output ::: "<sub><b><i><u>TEXT</u></i></b></sub>" (or <sup>...</sup>).
# The surrounding <html> wrapper is NOT added here.
#
# Formatting flags only affect <t> children that come after the <rPr> that set them.
def extract_html_r(r_elem)
  str = ''
  # Key order matters: it is the order in which create_html opens the tags.
  xml_elems = {
    sub: false,
    sup: false,
    b: false,
    i: false,
    u: false
  }
  r_elem.children.each do |elem|
    case elem.name
    when 'rPr'
      elem.children.each do |property|
        case property.name
        when 'b'
          xml_elems[:b] = true
        when 'i'
          xml_elems[:i] = true
        when 'u'
          xml_elems[:u] = true
        when 'vertAlign'
          # A vertAlign without a val attribute carries no alignment; skip it
          # instead of calling #value on nil.
          val_attr = property.xpath('@val').first
          case val_attr && val_attr.value
          when 'subscript'
            # Subscript and superscript are mutually exclusive.
            xml_elems[:sub] = true
            xml_elems[:sup] = false
          when 'superscript'
            xml_elems[:sup] = true
            xml_elems[:sub] = false
          end
        end
      end
    when 't'
      str << create_html(elem.content, xml_elems)
    end
  end
  str
end
# Wraps +text+ in an HTML tag for every enabled entry of +formatting+.
# text ::: the string to wrap
# formatting ::: tag name => flag pairs; tags open in iteration order and close in reverse
#
# Returns a new String, e.g. "<sup><b>TEXT</b></sup>".
def create_html(text, formatting)
  active_tags = formatting.each_with_object([]) do |(tag, enabled), tags|
    tags << tag if enabled
  end

  html = ''
  active_tags.each { |tag| html << "<#{tag}>" }
  html << text
  active_tags.reverse_each { |tag| html << "</#{tag}>" }
  html
end
end # class SharedStrings < Excelx::Extractor
end # class Excelx
end # module Roo
|