1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
|
require 'nokogiri'
require 'zip/filesystem'
require 'roo/link'
require 'roo/tempdir'
require 'roo/utils'
require 'forwardable'
require 'set'
module Roo
class Excelx < Roo::Base
extend Roo::Tempdir
extend Forwardable
ERROR_VALUES = %w(#N/A #REF! #NAME? #DIV/0! #NULL! #VALUE! #NUM!).to_set
require 'roo/excelx/shared'
require 'roo/excelx/workbook'
require 'roo/excelx/shared_strings'
require 'roo/excelx/styles'
require 'roo/excelx/cell'
require 'roo/excelx/sheet'
require 'roo/excelx/relationships'
require 'roo/excelx/comments'
require 'roo/excelx/sheet_doc'
require 'roo/excelx/coordinate'
require 'roo/excelx/format'
require 'roo/excelx/images'
delegate [:styles, :workbook, :shared_strings, :rels_files, :sheet_files, :comments_files, :image_rels, :image_files] => :@shared
ExceedsMaxError = Class.new(StandardError)
# initialization and opening of a spreadsheet file
# values for packed: :zip
# optional cell_max (int) parameter for early aborting attempts to parse
# enormous documents.
def initialize(filename_or_stream, options = {})
packed = options[:packed]
file_warning = options.fetch(:file_warning, :error)
cell_max = options.delete(:cell_max)
sheet_options = {}
sheet_options[:expand_merged_ranges] = (options[:expand_merged_ranges] || false)
sheet_options[:no_hyperlinks] = (options[:no_hyperlinks] || false)
sheet_options[:empty_cell] = (options[:empty_cell] || false)
shared_options = {}
shared_options[:disable_html_wrapper] = (options[:disable_html_wrapper] || false)
unless is_stream?(filename_or_stream)
file_type_check(filename_or_stream, %w[.xlsx .xlsm], 'an Excel 2007', file_warning, packed)
basename = find_basename(filename_or_stream)
end
# NOTE: Create temp directory and allow Ruby to cleanup the temp directory
# when the object is garbage collected. Initially, the finalizer was
# created in the Roo::Tempdir module, but that led to a segfault
# when testing in Ruby 2.4.0.
@tmpdir = self.class.make_tempdir(self, basename, options[:tmpdir_root])
ObjectSpace.define_finalizer(self, self.class.finalize(object_id))
@shared = Shared.new(@tmpdir, shared_options)
@filename = local_filename(filename_or_stream, @tmpdir, packed)
process_zipfile(@filename || filename_or_stream)
@sheet_names = []
@sheets = []
@sheets_by_name = {}
workbook.sheets.each_with_index do |sheet, index|
next if options[:only_visible_sheets] && sheet['state'] == 'hidden'
sheet_name = sheet['name']
@sheet_names << sheet_name
@sheets_by_name[sheet_name] = @sheets[index] = Sheet.new(sheet_name, @shared, index, sheet_options)
end
if cell_max
cell_count = ::Roo::Utils.num_cells_in_range(sheet_for(options.delete(:sheet)).dimensions)
raise ExceedsMaxError.new("Excel file exceeds cell maximum: #{cell_count} > #{cell_max}") if cell_count > cell_max
end
super
rescue
self.class.finalize_tempdirs(object_id)
raise
end
def method_missing(method, *args)
if (label = workbook.defined_names[method.to_s])
safe_send(sheet_for(label.sheet).cells[label.key], :value)
else
# call super for methods like #a1
super
end
end
def sheets
@sheet_names
end
def sheet_for(sheet)
sheet ||= default_sheet
validate_sheet!(sheet)
@sheets_by_name[sheet] || @sheets[sheet]
end
def images(sheet = nil)
images_names = sheet_for(sheet).images.map(&:last)
images_names.map { |iname| image_files.find { |ifile| ifile[iname] } }
end
# Returns the content of a spreadsheet-cell.
# (1,1) is the upper left corner.
# (1,1), (1,'A'), ('A',1), ('a',1) all refers to the
# cell at the first line and first row.
def cell(row, col, sheet = nil)
key = normalize(row, col)
safe_send(sheet_for(sheet).cells[key], :value)
end
def row(rownumber, sheet = nil)
sheet_for(sheet).row(rownumber)
end
# returns all values in this column as an array
# column numbers are 1,2,3,... like in the spreadsheet
def column(column_number, sheet = nil)
if column_number.is_a?(::String)
column_number = ::Roo::Utils.letter_to_number(column_number)
end
sheet_for(sheet).column(column_number)
end
# returns the number of the first non-empty row
def first_row(sheet = nil)
sheet_for(sheet).first_row
end
# returns the number of the last non-empty row
def last_row(sheet = nil)
sheet_for(sheet).last_row
end
# returns the number of the first non-empty column
def first_column(sheet = nil)
sheet_for(sheet).first_column
end
# returns the number of the last non-empty column
def last_column(sheet = nil)
sheet_for(sheet).last_column
end
# set a cell to a certain value
# (this will not be saved back to the spreadsheet file!)
def set(row, col, value, sheet = nil) #:nodoc:
key = normalize(row, col)
cell_type = cell_type_by_value(value)
sheet_for(sheet).cells[key] = Cell.new(value, cell_type, nil, cell_type, value, nil, nil, nil, Coordinate.new(row, col))
end
# Returns the formula at (row,col).
# Returns nil if there is no formula.
# The method #formula? checks if there is a formula.
def formula(row, col, sheet = nil)
key = normalize(row, col)
safe_send(sheet_for(sheet).cells[key], :formula)
end
# Predicate methods really should return a boolean
# value. Hopefully no one was relying on the fact that this
# previously returned either nil/formula
def formula?(*args)
!!formula(*args)
end
# returns each formula in the selected sheet as an array of tuples in following format
# [[row, col, formula], [row, col, formula],...]
def formulas(sheet = nil)
sheet_for(sheet).cells.select { |_, cell| cell.formula }.map do |(x, y), cell|
[x, y, cell.formula]
end
end
# Given a cell, return the cell's style
def font(row, col, sheet = nil)
key = normalize(row, col)
definition_index = safe_send(sheet_for(sheet).cells[key], :style)
styles.definitions[definition_index] if definition_index
end
# returns the type of a cell:
# * :float
# * :string,
# * :date
# * :percentage
# * :formula
# * :time
# * :datetime
def celltype(row, col, sheet = nil)
key = normalize(row, col)
safe_send(sheet_for(sheet).cells[key], :type)
end
# returns the internal type of an excel cell
# * :numeric_or_formula
# * :string
# Note: this is only available within the Excelx class
def excelx_type(row, col, sheet = nil)
key = normalize(row, col)
safe_send(sheet_for(sheet).cells[key], :cell_type)
end
# returns the internal value of an excelx cell
# Note: this is only available within the Excelx class
def excelx_value(row, col, sheet = nil)
key = normalize(row, col)
safe_send(sheet_for(sheet).cells[key], :cell_value)
end
# returns the internal value of an excelx cell
# Note: this is only available within the Excelx class
def formatted_value(row, col, sheet = nil)
key = normalize(row, col)
safe_send(sheet_for(sheet).cells[key], :formatted_value)
end
# returns the internal format of an excel cell
def excelx_format(row, col, sheet = nil)
key = normalize(row, col)
sheet_for(sheet).excelx_format(key)
end
def empty?(row, col, sheet = nil)
sheet = sheet_for(sheet)
key = normalize(row, col)
cell = sheet.cells[key]
!cell || cell.empty? ||
(row < sheet.first_row || row > sheet.last_row || col < sheet.first_column || col > sheet.last_column)
end
# shows the internal representation of all cells
# for debugging purposes
def to_s(sheet = nil)
sheet_for(sheet).cells.inspect
end
# returns the row,col values of the labelled cell
# (nil,nil) if label is not defined
def label(name)
labels = workbook.defined_names
return [nil, nil, nil] if labels.empty? || !labels.key?(name)
[labels[name].row, labels[name].col, labels[name].sheet]
end
# Returns an array which all labels. Each element is an array with
# [labelname, [row,col,sheetname]]
def labels
@labels ||= workbook.defined_names.map do |name, label|
[
name,
[label.row, label.col, label.sheet]
]
end
end
def hyperlink?(row, col, sheet = nil)
!!hyperlink(row, col, sheet)
end
# returns the hyperlink at (row/col)
# nil if there is no hyperlink
def hyperlink(row, col, sheet = nil)
key = normalize(row, col)
sheet_for(sheet).hyperlinks[key]
end
# returns the comment at (row/col)
# nil if there is no comment
def comment(row, col, sheet = nil)
key = normalize(row, col)
sheet_for(sheet).comments[key]
end
# true, if there is a comment
def comment?(row, col, sheet = nil)
!!comment(row, col, sheet)
end
def comments(sheet = nil)
sheet_for(sheet).comments.map do |(x, y), comment|
[x, y, comment]
end
end
# Yield an array of Excelx::Cell
# Takes options for sheet, pad_cells, and max_rows
def each_row_streaming(options = {})
sheet = sheet_for(options.delete(:sheet))
if block_given?
sheet.each_row(options) { |row| yield row }
else
sheet.to_enum(:each_row, options)
end
end
private
def clean_sheet(sheet)
@sheets_by_name[sheet].cells.each_pair do |coord, value|
next unless value.value.is_a?(::String)
@sheets_by_name[sheet].cells[coord].value = sanitize_value(value.value)
end
@cleaned[sheet] = true
end
# Internal: extracts the worksheet_ids from the workbook.xml file. xlsx
# documents require a workbook.xml file, so a if the file is missing
# it is not a valid xlsx file. In these cases, an ArgumentError is
# raised.
#
# wb - a Zip::Entry for the workbook.xml file.
# path - A String for Zip::Entry's destination path.
#
# Examples
#
# extract_worksheet_ids(<Zip::Entry>, 'tmpdir/roo_workbook.xml')
# # => ["rId1", "rId2", "rId3"]
#
# Returns an Array of Strings.
def extract_worksheet_ids(entries, path)
wb = entries.find { |e| e.name[/workbook.xml$/] }
fail ArgumentError 'missing required workbook file' if wb.nil?
wb.extract(File.basename(path), destination_directory: File.dirname(path))
workbook_doc = Roo::Utils.load_xml(path).remove_namespaces!
workbook_doc.xpath('//sheet').map { |s| s['id'] }
end
# Internal
#
# wb_rels - A Zip::Entry for the workbook.xml.rels file.
# path - A String for the Zip::Entry's destination path.
#
# Examples
#
# extract_worksheets(<Zip::Entry>, 'tmpdir/roo_workbook.xml.rels')
# # => {
# "rId1"=>"worksheets/sheet1.xml",
# "rId2"=>"worksheets/sheet2.xml",
# "rId3"=>"worksheets/sheet3.xml"
# }
#
# Returns a Hash.
def extract_worksheet_rels(entries, path)
wb_rels = entries.find { |e| e.name[/workbook.xml.rels$/] }
fail ArgumentError 'missing required workbook file' if wb_rels.nil?
wb_rels.extract(File.basename(path), destination_directory: File.dirname(path))
rels_doc = Roo::Utils.load_xml(path).remove_namespaces!
relationships = rels_doc.xpath('//Relationship').select do |relationship|
worksheet_types.include? relationship['Type']
end
relationships.each_with_object({}) do |relationship, hash|
hash[relationship['Id']] = relationship['Target']
end
end
# Extracts the sheets in order, but it will ignore sheets that are not
# worksheets.
def extract_sheets_in_order(entries, sheet_ids, sheets, tmpdir)
(sheet_ids & sheets.keys).each_with_index do |id, i|
name = sheets[id]
entry = entries.find { |e| "/#{e.name}" =~ /#{name}$/ }
path = "#{tmpdir}/roo_sheet#{i + 1}"
sheet_files << path
@sheet_files << path
entry.extract(File.basename(path), destination_directory: File.dirname(path))
end
end
def extract_images(entries, tmpdir)
img_entries = entries.select { |e| e.name[/media\/image([0-9]+)/] }
img_entries.each do |entry|
path = "#{@tmpdir}/roo#{entry.name.gsub(/xl\/|\//, "_")}"
image_files << path
entry.extract(File.basename(path), destination_directory: File.dirname(path))
end
end
# Extracts all needed files from the zip file
def process_zipfile(zipfilename_or_stream)
@sheet_files = []
unless is_stream?(zipfilename_or_stream)
zip_file = Zip::File.open(zipfilename_or_stream)
else
zip_file = Zip::CentralDirectory.new
zip_file.read_from_stream zipfilename_or_stream
end
process_zipfile_entries zip_file.entries.sort_by(&:name)
end
def process_zipfile_entries(entries)
# NOTE: When Google or Numbers 3.1 exports to xlsx, the worksheet filenames
# are not in order. With Numbers 3.1, the first sheet is always
# sheet.xml, not sheet1.xml. With Google, the order of the worksheets is
# independent of a worksheet's filename (i.e. sheet6.xml can be the
# first worksheet).
#
# workbook.xml lists the correct order of worksheets and
# workbook.xml.rels lists the filenames for those worksheets.
#
# workbook.xml:
# <sheet state="visible" name="IS" sheetId="1" r:id="rId3"/>
# <sheet state="visible" name="BS" sheetId="2" r:id="rId4"/>
# workbook.xml.rel:
# <Relationship Id="rId4" Target="worksheets/sheet5.xml" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet"/>
# <Relationship Id="rId3" Target="worksheets/sheet4.xml" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet"/>
sheet_ids = extract_worksheet_ids(entries, "#{@tmpdir}/roo_workbook.xml")
sheets = extract_worksheet_rels(entries, "#{@tmpdir}/roo_workbook.xml.rels")
extract_sheets_in_order(entries, sheet_ids, sheets, @tmpdir)
extract_images(entries, @tmpdir)
entries.each do |entry|
path =
case entry.name.downcase
when /richdata/
# FIXME: Ignore richData as parsing is not implemented yet and can cause
# Zip::DestinationFileExistsError when including a second "styles.xml" entry
# see http://schemas.microsoft.com/office/spreadsheetml/2017/richdata2
nil
when /sharedstrings.xml$/
"#{@tmpdir}/roo_sharedStrings.xml"
when /styles.xml$/
"#{@tmpdir}/roo_styles.xml"
when /comments([0-9]+).xml$/
# FIXME: Most of the time, The order of the comment files are the same
# the sheet order, i.e. sheet1.xml's comments are in comments1.xml.
# In some situations, this isn't true. The true location of a
# sheet's comment file is in the sheet1.xml.rels file. SEE
# ECMA-376 12.3.3 in "Ecma Office Open XML Part 1".
nr = Regexp.last_match[1].to_i
comments_files[nr - 1] = "#{@tmpdir}/roo_comments#{nr}"
when %r{chartsheets/_rels/sheet([0-9]+).xml.rels$}
# NOTE: Chart sheet relationship files were interfering with
# worksheets.
nil
when /sheet([0-9]+).xml.rels$/
# FIXME: Roo seems to use sheet[\d].xml.rels for hyperlinks only, but
# it also stores the location for sharedStrings, comments,
# drawings, etc.
nr = Regexp.last_match[1].to_i
rels_files[nr - 1] = "#{@tmpdir}/roo_rels#{nr}"
when /drawing([0-9]+).xml.rels$/
# Extracting drawing relationships to make images lists for each sheet
nr = Regexp.last_match[1].to_i
image_rels[nr - 1] = "#{@tmpdir}/roo_image_rels#{nr}"
end
entry.extract(File.basename(path), destination_directory: File.dirname(path)) if path
end
end
def safe_send(object, method, *args)
object.send(method, *args) if object&.respond_to?(method)
end
def worksheet_types
[
'http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet', # OOXML Transitional
'http://purl.oclc.org/ooxml/officeDocument/relationships/worksheet' # OOXML Strict
]
end
end
end
|