aboutsummaryrefslogtreecommitdiffstats
path: root/lib/roo/excelx.rb
blob: 91ebc1e0cbceeeb01da6d9400f6ef95f2eab283e (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
require 'nokogiri'
require 'zip/filesystem'
require 'roo/link'
require 'roo/tempdir'
require 'roo/utils'
require 'forwardable'
require 'set'

module Roo
  class Excelx < Roo::Base
    extend Roo::Tempdir
    extend Forwardable

    ERROR_VALUES = %w(#N/A #REF! #NAME? #DIV/0! #NULL! #VALUE! #NUM!).to_set

    require 'roo/excelx/shared'
    require 'roo/excelx/workbook'
    require 'roo/excelx/shared_strings'
    require 'roo/excelx/styles'
    require 'roo/excelx/cell'
    require 'roo/excelx/sheet'
    require 'roo/excelx/relationships'
    require 'roo/excelx/comments'
    require 'roo/excelx/sheet_doc'
    require 'roo/excelx/coordinate'
    require 'roo/excelx/format'
    require 'roo/excelx/images'

    delegate [:styles, :workbook, :shared_strings, :rels_files, :sheet_files, :comments_files, :image_rels, :image_files] => :@shared
    ExceedsMaxError = Class.new(StandardError)

    # initialization and opening of a spreadsheet file
    # values for packed: :zip
    # optional cell_max (int) parameter for early aborting attempts to parse
    # enormous documents.
    def initialize(filename_or_stream, options = {})
      packed = options[:packed]
      file_warning = options.fetch(:file_warning, :error)
      cell_max = options.delete(:cell_max)
      sheet_options = {}
      sheet_options[:expand_merged_ranges] = (options[:expand_merged_ranges] || false)
      sheet_options[:no_hyperlinks] = (options[:no_hyperlinks] || false)
      sheet_options[:empty_cell] = (options[:empty_cell] || false)
      shared_options = {}

      shared_options[:disable_html_wrapper] = (options[:disable_html_wrapper] || false)
      unless is_stream?(filename_or_stream)
        file_type_check(filename_or_stream, %w[.xlsx .xlsm], 'an Excel 2007', file_warning, packed)
        basename = find_basename(filename_or_stream)
      end

      # NOTE: Create temp directory and allow Ruby to cleanup the temp directory
      #       when the object is garbage collected. Initially, the finalizer was
      #       created in the Roo::Tempdir module, but that led to a segfault
      #       when testing in Ruby 2.4.0.
      @tmpdir = self.class.make_tempdir(self, basename, options[:tmpdir_root])
      ObjectSpace.define_finalizer(self, self.class.finalize(object_id))

      @shared = Shared.new(@tmpdir, shared_options)
      @filename = local_filename(filename_or_stream, @tmpdir, packed)
      process_zipfile(@filename || filename_or_stream)

      @sheet_names = []
      @sheets = []
      @sheets_by_name = {}

      workbook.sheets.each_with_index do |sheet, index|
        next if options[:only_visible_sheets] && sheet['state'] == 'hidden'

        sheet_name = sheet['name']
        @sheet_names << sheet_name
        @sheets_by_name[sheet_name] = @sheets[index] = Sheet.new(sheet_name, @shared, index, sheet_options)
      end

      if cell_max
        cell_count = ::Roo::Utils.num_cells_in_range(sheet_for(options.delete(:sheet)).dimensions)
        raise ExceedsMaxError.new("Excel file exceeds cell maximum: #{cell_count} > #{cell_max}") if cell_count > cell_max
      end

      super
    rescue
      self.class.finalize_tempdirs(object_id)
      raise
    end

    def method_missing(method, *args)
      if (label = workbook.defined_names[method.to_s])
        safe_send(sheet_for(label.sheet).cells[label.key], :value)
      else
        # call super for methods like #a1
        super
      end
    end

    def sheets
      @sheet_names
    end

    def sheet_for(sheet)
      sheet ||= default_sheet
      validate_sheet!(sheet)
      @sheets_by_name[sheet] || @sheets[sheet]
    end

    def images(sheet = nil)
      images_names = sheet_for(sheet).images.map(&:last)
      images_names.map { |iname| image_files.find { |ifile| ifile[iname] } }
    end

    # Returns the content of a spreadsheet-cell.
    # (1,1) is the upper left corner.
    # (1,1), (1,'A'), ('A',1), ('a',1) all refers to the
    # cell at the first line and first row.
    def cell(row, col, sheet = nil)
      key = normalize(row, col)
      safe_send(sheet_for(sheet).cells[key], :value)
    end

    def row(rownumber, sheet = nil)
      sheet_for(sheet).row(rownumber)
    end

    # returns all values in this column as an array
    # column numbers are 1,2,3,... like in the spreadsheet
    def column(column_number, sheet = nil)
      if column_number.is_a?(::String)
        column_number = ::Roo::Utils.letter_to_number(column_number)
      end
      sheet_for(sheet).column(column_number)
    end

    # returns the number of the first non-empty row
    def first_row(sheet = nil)
      sheet_for(sheet).first_row
    end

    # returns the number of the last non-empty row
    def last_row(sheet = nil)
      sheet_for(sheet).last_row
    end

    # returns the number of the first non-empty column
    def first_column(sheet = nil)
      sheet_for(sheet).first_column
    end

    # returns the number of the last non-empty column
    def last_column(sheet = nil)
      sheet_for(sheet).last_column
    end

    # set a cell to a certain value
    # (this will not be saved back to the spreadsheet file!)
    def set(row, col, value, sheet = nil) #:nodoc:
      key = normalize(row, col)
      cell_type = cell_type_by_value(value)
      sheet_for(sheet).cells[key] = Cell.new(value, cell_type, nil, cell_type, value, nil, nil, nil, Coordinate.new(row, col))
    end

    # Returns the formula at (row,col).
    # Returns nil if there is no formula.
    # The method #formula? checks if there is a formula.
    def formula(row, col, sheet = nil)
      key = normalize(row, col)
      safe_send(sheet_for(sheet).cells[key], :formula)
    end

    # Predicate methods really should return a boolean
    # value. Hopefully no one was relying on the fact that this
    # previously returned either nil/formula
    def formula?(*args)
      !!formula(*args)
    end

    # returns each formula in the selected sheet as an array of tuples in following format
    # [[row, col, formula], [row, col, formula],...]
    def formulas(sheet = nil)
      sheet_for(sheet).cells.select { |_, cell| cell.formula }.map do |(x, y), cell|
        [x, y, cell.formula]
      end
    end

    # Given a cell, return the cell's style
    def font(row, col, sheet = nil)
      key = normalize(row, col)
      definition_index = safe_send(sheet_for(sheet).cells[key], :style)
      styles.definitions[definition_index] if definition_index
    end

    # returns the type of a cell:
    # * :float
    # * :string,
    # * :date
    # * :percentage
    # * :formula
    # * :time
    # * :datetime
    def celltype(row, col, sheet = nil)
      key = normalize(row, col)
      safe_send(sheet_for(sheet).cells[key], :type)
    end

    # returns the internal type of an excel cell
    # * :numeric_or_formula
    # * :string
    # Note: this is only available within the Excelx class
    def excelx_type(row, col, sheet = nil)
      key = normalize(row, col)
      safe_send(sheet_for(sheet).cells[key], :cell_type)
    end

    # returns the internal value of an excelx cell
    # Note: this is only available within the Excelx class
    def excelx_value(row, col, sheet = nil)
      key = normalize(row, col)
      safe_send(sheet_for(sheet).cells[key], :cell_value)
    end

    # returns the internal value of an excelx cell
    # Note: this is only available within the Excelx class
    def formatted_value(row, col, sheet = nil)
      key = normalize(row, col)
      safe_send(sheet_for(sheet).cells[key], :formatted_value)
    end

    # returns the internal format of an excel cell
    def excelx_format(row, col, sheet = nil)
      key = normalize(row, col)
      sheet_for(sheet).excelx_format(key)
    end

    def empty?(row, col, sheet = nil)
      sheet = sheet_for(sheet)
      key = normalize(row, col)
      cell = sheet.cells[key]
      !cell || cell.empty? ||
        (row < sheet.first_row || row > sheet.last_row || col < sheet.first_column || col > sheet.last_column)
    end

    # shows the internal representation of all cells
    # for debugging purposes
    def to_s(sheet = nil)
      sheet_for(sheet).cells.inspect
    end

    # returns the row,col values of the labelled cell
    # (nil,nil) if label is not defined
    def label(name)
      labels = workbook.defined_names
      return [nil, nil, nil] if labels.empty? || !labels.key?(name)

      [labels[name].row, labels[name].col, labels[name].sheet]
    end

    # Returns an array which all labels. Each element is an array with
    # [labelname, [row,col,sheetname]]
    def labels
      @labels ||= workbook.defined_names.map do |name, label|
        [
          name,
          [label.row, label.col, label.sheet]
        ]
      end
    end

    def hyperlink?(row, col, sheet = nil)
      !!hyperlink(row, col, sheet)
    end

    # returns the hyperlink at (row/col)
    # nil if there is no hyperlink
    def hyperlink(row, col, sheet = nil)
      key = normalize(row, col)
      sheet_for(sheet).hyperlinks[key]
    end

    # returns the comment at (row/col)
    # nil if there is no comment
    def comment(row, col, sheet = nil)
      key = normalize(row, col)
      sheet_for(sheet).comments[key]
    end

    # true, if there is a comment
    def comment?(row, col, sheet = nil)
      !!comment(row, col, sheet)
    end

    def comments(sheet = nil)
      sheet_for(sheet).comments.map do |(x, y), comment|
        [x, y, comment]
      end
    end

    # Yield an array of Excelx::Cell
    # Takes options for sheet, pad_cells, and max_rows
    def each_row_streaming(options = {})
      sheet = sheet_for(options.delete(:sheet))
      if block_given?
        sheet.each_row(options) { |row| yield row }
      else
        sheet.to_enum(:each_row, options)
      end
    end

    private

    def clean_sheet(sheet)
      @sheets_by_name[sheet].cells.each_pair do |coord, value|
        next unless value.value.is_a?(::String)

        @sheets_by_name[sheet].cells[coord].value = sanitize_value(value.value)
      end

      @cleaned[sheet] = true
    end

    # Internal: extracts the worksheet_ids from the workbook.xml file. xlsx
    #           documents require a workbook.xml file, so a if the file is missing
    #           it is not a valid xlsx file. In these cases, an ArgumentError is
    #           raised.
    #
    # wb - a Zip::Entry for the workbook.xml file.
    # path - A String for Zip::Entry's destination path.
    #
    # Examples
    #
    #   extract_worksheet_ids(<Zip::Entry>, 'tmpdir/roo_workbook.xml')
    #   # => ["rId1", "rId2", "rId3"]
    #
    # Returns an Array of Strings.
    def extract_worksheet_ids(entries, path)
      wb = entries.find { |e| e.name[/workbook.xml$/] }
      fail ArgumentError 'missing required workbook file' if wb.nil?

      wb.extract(path)
      workbook_doc = Roo::Utils.load_xml(path).remove_namespaces!
      workbook_doc.xpath('//sheet').map { |s| s['id'] }
    end

    # Internal
    #
    # wb_rels - A Zip::Entry for the workbook.xml.rels file.
    # path - A String for the Zip::Entry's destination path.
    #
    # Examples
    #
    #   extract_worksheets(<Zip::Entry>, 'tmpdir/roo_workbook.xml.rels')
    #   # => {
    #         "rId1"=>"worksheets/sheet1.xml",
    #         "rId2"=>"worksheets/sheet2.xml",
    #         "rId3"=>"worksheets/sheet3.xml"
    #        }
    #
    # Returns a Hash.
    def extract_worksheet_rels(entries, path)
      wb_rels = entries.find { |e| e.name[/workbook.xml.rels$/] }
      fail ArgumentError 'missing required workbook file' if wb_rels.nil?

      wb_rels.extract(path)
      rels_doc = Roo::Utils.load_xml(path).remove_namespaces!

      relationships = rels_doc.xpath('//Relationship').select do |relationship|
        worksheet_types.include? relationship['Type']
      end

      relationships.each_with_object({}) do |relationship, hash|
        hash[relationship['Id']] = relationship['Target']
      end
    end

    # Extracts the sheets in order, but it will ignore sheets that are not
    # worksheets.
    def extract_sheets_in_order(entries, sheet_ids, sheets, tmpdir)
      (sheet_ids & sheets.keys).each_with_index do |id, i|
        name = sheets[id]
        entry = entries.find { |e| "/#{e.name}" =~ /#{name}$/ }
        path = "#{tmpdir}/roo_sheet#{i + 1}"
        sheet_files << path
        @sheet_files << path
        entry.extract(path)
      end
    end

    def extract_images(entries, tmpdir)
      img_entries = entries.select { |e| e.name[/media\/image([0-9]+)/] }
      img_entries.each do |entry|
        path = "#{@tmpdir}/roo#{entry.name.gsub(/xl\/|\//, "_")}"
        image_files << path
        entry.extract(path)
      end
    end

    # Extracts all needed files from the zip file
    def process_zipfile(zipfilename_or_stream)
      @sheet_files = []

      unless is_stream?(zipfilename_or_stream)
        zip_file = Zip::File.open(zipfilename_or_stream)
      else
        zip_file = Zip::CentralDirectory.new
        zip_file.read_from_stream zipfilename_or_stream
      end

      process_zipfile_entries zip_file.to_a.sort_by(&:name)
    end

    def process_zipfile_entries(entries)
      # NOTE: When Google or Numbers 3.1 exports to xlsx, the worksheet filenames
      #       are not in order. With Numbers 3.1, the first sheet is always
      #       sheet.xml, not sheet1.xml. With Google, the order of the worksheets is
      #       independent of a worksheet's filename (i.e. sheet6.xml can be the
      #       first worksheet).
      #
      #       workbook.xml lists the correct order of worksheets and
      #       workbook.xml.rels lists the filenames for those worksheets.
      #
      #       workbook.xml:
      #         <sheet state="visible" name="IS" sheetId="1" r:id="rId3"/>
      #         <sheet state="visible" name="BS" sheetId="2" r:id="rId4"/>
      #       workbook.xml.rel:
      #         <Relationship Id="rId4" Target="worksheets/sheet5.xml" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet"/>
      #         <Relationship Id="rId3" Target="worksheets/sheet4.xml" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet"/>
      sheet_ids = extract_worksheet_ids(entries, "#{@tmpdir}/roo_workbook.xml")
      sheets = extract_worksheet_rels(entries, "#{@tmpdir}/roo_workbook.xml.rels")
      extract_sheets_in_order(entries, sheet_ids, sheets, @tmpdir)
      extract_images(entries, @tmpdir)

      entries.each do |entry|
        path =
        case entry.name.downcase
        when /richdata/
          # FIXME: Ignore richData as parsing is not implemented yet and can cause
          # Zip::DestinationFileExistsError when including a second "styles.xml" entry
          # see http://schemas.microsoft.com/office/spreadsheetml/2017/richdata2
          nil
        when /sharedstrings.xml$/
          "#{@tmpdir}/roo_sharedStrings.xml"
        when /styles.xml$/
          "#{@tmpdir}/roo_styles.xml"
        when /comments([0-9]+).xml$/
          # FIXME: Most of the time, The order of the comment files are the same
          #       the sheet order, i.e. sheet1.xml's comments are in comments1.xml.
          #       In some situations, this isn't true. The true location of a
          #       sheet's comment file is in the sheet1.xml.rels file. SEE
          #       ECMA-376 12.3.3 in "Ecma Office Open XML Part 1".
          nr = Regexp.last_match[1].to_i
          comments_files[nr - 1] = "#{@tmpdir}/roo_comments#{nr}"
        when %r{chartsheets/_rels/sheet([0-9]+).xml.rels$}
          # NOTE: Chart sheet relationship files were interfering with
          #       worksheets.
          nil
        when /sheet([0-9]+).xml.rels$/
          # FIXME: Roo seems to use sheet[\d].xml.rels for hyperlinks only, but
          #        it also stores the location for sharedStrings, comments,
          #        drawings, etc.
          nr = Regexp.last_match[1].to_i
          rels_files[nr - 1] = "#{@tmpdir}/roo_rels#{nr}"
        when /drawing([0-9]+).xml.rels$/
          # Extracting drawing relationships to make images lists for each sheet
          nr = Regexp.last_match[1].to_i
          image_rels[nr - 1] = "#{@tmpdir}/roo_image_rels#{nr}"
        end

        entry.extract(path) if path
      end
    end

    def safe_send(object, method, *args)
      object.send(method, *args) if object&.respond_to?(method)
    end

    def worksheet_types
      [
        'http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet', # OOXML Transitional
        'http://purl.oclc.org/ooxml/officeDocument/relationships/worksheet' # OOXML Strict
      ]
    end
  end
end