summaryrefslogtreecommitdiffstats
path: root/lib/roo/excelx/shared_strings.rb
blob: e70b62381c9f9f2d6194b91e80fcc491b4201deb (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# frozen_string_literal: true

require 'roo/excelx/extractor'

module Roo
  class Excelx
    class SharedStrings < Excelx::Extractor
      def [](index)
        to_a[index]
      end

      def to_a
        @array ||= extract_shared_strings
      end

      def to_html
        @html ||= extract_html
      end

      # Use to_html or to_a for html returns
      # See what is happening with commit???
      def use_html?(index)
        return false if @options[:disable_html_wrapper]
        to_html[index][/<([biu]|sup|sub)>/]
      end

      private

      def fix_invalid_shared_strings(doc)
        invalid = { '_x000D_'  => "\n" }
        xml = doc.to_s
        return doc unless xml[/#{invalid.keys.join('|')}/]

        ::Nokogiri::XML(xml.gsub(/#{invalid.keys.join('|')}/, invalid))
      end

      def extract_shared_strings
        return [] unless doc_exists?

        document = fix_invalid_shared_strings(doc)
        # read the shared strings xml document
        document.xpath('/sst/si').map do |si|
          shared_string = +""
          si.children.each do |elem|
            case elem.name
            when 'r'
              elem.children.each do |r_elem|
                shared_string << r_elem.content if r_elem.name == 't'
              end
            when 't'
              shared_string = elem.content
            end
          end
          shared_string
        end
      end

      def extract_html
        return [] unless doc_exists?
        fix_invalid_shared_strings(doc)
        # read the shared strings xml document
        doc.xpath('/sst/si').map do |si|
          html_string = '<html>'.dup
          si.children.each do |elem|
            case elem.name
            when 'r'
              html_string << extract_html_r(elem)
            when 't'
              html_string << elem.content
            end # case elem.name
          end # si.children.each do |elem|
          html_string << '</html>'
        end # doc.xpath('/sst/si').map do |si|
      end # def extract_html

      # The goal of this function is to take the following XML code snippet and create a html tag
      # r_elem ::: XML Element that is in sharedStrings.xml of excel_book.xlsx
      # {code:xml}
      # <r>
      #   <rPr>
      #      <i/>
      #      <b/>
      #      <u/>
      #      <vertAlign val="subscript"/>
      #      <vertAlign val="superscript"/>
      #   </rPr>
      #   <t>TEXT</t>
      # </r>
      # {code}
      #
      # Expected Output ::: "<html><sub|sup><b><i><u>TEXT</u></i></b></sub|/sup></html>"
      def extract_html_r(r_elem)
        str = +""
        xml_elems = {
          sub: false,
          sup: false,
          b:   false,
          i:   false,
          u:   false
        }
        r_elem.children.each do |elem|
          case elem.name
          when 'rPr'
            elem.children.each do |rPr_elem|
              case rPr_elem.name
              when 'b'
                # set formatting for Bold to true
                xml_elems[:b] = true
              when 'i'
                # set formatting for Italics to true
                xml_elems[:i] = true
              when 'u'
                # set formatting for Underline to true
                xml_elems[:u] = true
              when 'vertAlign'
                # See if the Vertical Alignment is subscript or superscript
                case rPr_elem.xpath('@val').first.value
                when 'subscript'
                  # set formatting for Subscript to true and Superscript to false ... Can't have both
                  xml_elems[:sub] = true
                  xml_elems[:sup] = false
                when 'superscript'
                  # set formatting for Superscript to true and Subscript to false ... Can't have both
                  xml_elems[:sup] = true
                  xml_elems[:sub] = false
                end
              end
            end
          when 't'
            str << create_html(elem.content, xml_elems)
          end
        end
        str
      end # extract_html_r

      # This will return an html string
      def create_html(text, formatting)
        tmp_str = +""
        formatting.each do |elem, val|
          tmp_str << "<#{elem}>" if val
        end
        tmp_str << text

        formatting.reverse_each do |elem, val|
          tmp_str << "</#{elem}>" if val
        end
        tmp_str
      end
    end # class SharedStrings < Excelx::Extractor
  end # class Excelx
end # module Roo