1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
|
# encoding: utf-8
require 'tmpdir'
require 'stringio'
require 'nokogiri'
require 'roo/utils'
require "roo/formatters/base"
require "roo/formatters/csv"
require "roo/formatters/matrix"
require "roo/formatters/xml"
require "roo/formatters/yaml"
# Base class for all other types of spreadsheets
class Roo::Base
include Enumerable
include Roo::Formatters::Base
include Roo::Formatters::CSV
include Roo::Formatters::Matrix
include Roo::Formatters::XML
include Roo::Formatters::YAML
MAX_ROW_COL = 999_999.freeze
MIN_ROW_COL = 0.freeze
attr_reader :headers
# sets the line with attribute names (default: 1)
attr_accessor :header_line
def self.TEMP_PREFIX
warn '[DEPRECATION] please access TEMP_PREFIX via Roo::TEMP_PREFIX'
Roo::TEMP_PREFIX
end
def self.finalize(object_id)
proc { finalize_tempdirs(object_id) }
end
def initialize(filename, options = {}, _file_warning = :error, _tmpdir = nil)
@filename = filename
@options = options
@cell = {}
@cell_type = {}
@cells_read = {}
@first_row = {}
@last_row = {}
@first_column = {}
@last_column = {}
@header_line = 1
end
def close
if self.class.respond_to?(:finalize_tempdirs)
self.class.finalize_tempdirs(object_id)
end
nil
end
def default_sheet
@default_sheet ||= sheets.first
end
# sets the working sheet in the document
# 'sheet' can be a number (1 = first sheet) or the name of a sheet.
def default_sheet=(sheet)
validate_sheet!(sheet)
@default_sheet = sheet
@first_row[sheet] = @last_row[sheet] = @first_column[sheet] = @last_column[sheet] = nil
@cells_read[sheet] = false
end
# first non-empty column as a letter
def first_column_as_letter(sheet = default_sheet)
::Roo::Utils.number_to_letter(first_column(sheet))
end
# last non-empty column as a letter
def last_column_as_letter(sheet = default_sheet)
::Roo::Utils.number_to_letter(last_column(sheet))
end
# Set first/last row/column for sheet
def first_last_row_col_for_sheet(sheet)
@first_last_row_cols ||= {}
@first_last_row_cols[sheet] ||= begin
result = collect_last_row_col_for_sheet(sheet)
{
first_row: result[:first_row] == MAX_ROW_COL ? nil : result[:first_row],
first_column: result[:first_column] == MAX_ROW_COL ? nil : result[:first_column],
last_row: result[:last_row] == MIN_ROW_COL ? nil : result[:last_row],
last_column: result[:last_column] == MIN_ROW_COL ? nil : result[:last_column]
}
end
end
# Collect first/last row/column from sheet
def collect_last_row_col_for_sheet(sheet)
first_row = first_column = MAX_ROW_COL
last_row = last_column = MIN_ROW_COL
@cell[sheet].each_pair do|key, value|
next unless value
first_row = [first_row, key.first.to_i].min
last_row = [last_row, key.first.to_i].max
first_column = [first_column, key.last.to_i].min
last_column = [last_column, key.last.to_i].max
end if @cell[sheet]
{ first_row: first_row, first_column: first_column, last_row: last_row, last_column: last_column }
end
%w(first_row last_row first_column last_column).each do |key|
class_eval <<-EOS, __FILE__, __LINE__ + 1
def #{key}(sheet = default_sheet) # def first_row(sheet = default_sheet)
read_cells(sheet) # read_cells(sheet)
@#{key}[sheet] ||= first_last_row_col_for_sheet(sheet)[:#{key}] # @first_row[sheet] ||= first_last_row_col_for_sheet(sheet)[:first_row]
end # end
EOS
end
def inspect
"<##{self.class}:#{object_id.to_s(8)} #{instance_variables.join(' ')}>"
end
# find a row either by row number or a condition
# Caution: this works only within the default sheet -> set default_sheet before you call this method
# (experimental. see examples in the test_roo.rb file)
def find(*args) # :nodoc
options = (args.last.is_a?(Hash) ? args.pop : {})
case args[0]
when Integer
find_by_row(args[0])
when :all
find_by_conditions(options)
else
fail ArgumentError, "unexpected arg #{args[0].inspect}, pass a row index or :all"
end
end
# returns all values in this row as an array
# row numbers are 1,2,3,... like in the spreadsheet
def row(row_number, sheet = default_sheet)
read_cells(sheet)
first_column(sheet).upto(last_column(sheet)).map do |col|
cell(row_number, col, sheet)
end
end
# returns all values in this column as an array
# column numbers are 1,2,3,... like in the spreadsheet
def column(column_number, sheet = default_sheet)
if column_number.is_a?(::String)
column_number = ::Roo::Utils.letter_to_number(column_number)
end
read_cells(sheet)
first_row(sheet).upto(last_row(sheet)).map do |row|
cell(row, column_number, sheet)
end
end
# set a cell to a certain value
# (this will not be saved back to the spreadsheet file!)
def set(row, col, value, sheet = default_sheet) #:nodoc:
read_cells(sheet)
row, col = normalize(row, col)
cell_type = cell_type_by_value(value)
set_value(row, col, value, sheet)
set_type(row, col, cell_type, sheet)
end
def cell_type_by_value(value)
case value
when Integer then :float
when String, Float then :string
else
fail ArgumentError, "Type for #{value} not set"
end
end
# reopens and read a spreadsheet document
def reload
ds = default_sheet
reinitialize
self.default_sheet = ds
end
# true if cell is empty
def empty?(row, col, sheet = default_sheet)
read_cells(sheet)
row, col = normalize(row, col)
contents = cell(row, col, sheet)
!contents || (celltype(row, col, sheet) == :string && contents.empty?) \
|| (row < first_row(sheet) || row > last_row(sheet) || col < first_column(sheet) || col > last_column(sheet))
end
# returns information of the spreadsheet document and all sheets within
# this document.
def info
without_changing_default_sheet do
result = "File: #{File.basename(@filename)}\n"\
"Number of sheets: #{sheets.size}\n"\
"Sheets: #{sheets.join(', ')}\n"
n = 1
sheets.each do|sheet|
self.default_sheet = sheet
result << 'Sheet ' + n.to_s + ":\n"
if first_row
result << " First row: #{first_row}\n"
result << " Last row: #{last_row}\n"
result << " First column: #{::Roo::Utils.number_to_letter(first_column)}\n"
result << " Last column: #{::Roo::Utils.number_to_letter(last_column)}"
else
result << ' - empty -'
end
result << "\n" if sheet != sheets.last
n += 1
end
result
end
end
# when a method like spreadsheet.a42 is called
# convert it to a call of spreadsheet.cell('a',42)
def method_missing(m, *args)
# #aa42 => #cell('aa',42)
# #aa42('Sheet1') => #cell('aa',42,'Sheet1')
if m =~ /^([a-z]+)(\d+)$/
col = ::Roo::Utils.letter_to_number(Regexp.last_match[1])
row = Regexp.last_match[2].to_i
if args.empty?
cell(row, col)
else
cell(row, col, args.first)
end
else
super
end
end
# access different worksheets by calling spreadsheet.sheet(1)
# or spreadsheet.sheet('SHEETNAME')
def sheet(index, name = false)
self.default_sheet = index.is_a?(::String) ? index : sheets[index]
name ? [default_sheet, self] : self
end
# iterate through all worksheets of a document
def each_with_pagename
sheets.each do |s|
yield sheet(s, true)
end
end
# by passing in headers as options, this method returns
# specific columns from your header assignment
# for example:
# xls.sheet('New Prices').parse(:upc => 'UPC', :price => 'Price') would return:
# [{:upc => 123456789012, :price => 35.42},..]
# the queries are matched with regex, so regex options can be passed in
# such as :price => '^(Cost|Price)'
# case insensitive by default
# by using the :header_search option, you can query for headers
# and return a hash of every row with the keys set to the header result
# for example:
# xls.sheet('New Prices').parse(:header_search => ['UPC*SKU','^Price*\sCost\s'])
# that example searches for a column titled either UPC or SKU and another
# column titled either Price or Cost (regex characters allowed)
# * is the wildcard character
# you can also pass in a :clean => true option to strip the sheet of
# control characters and white spaces around columns
def each(options = {})
return to_enum(:each, options) unless block_given?
if options.empty?
1.upto(last_row) do |line|
yield row(line)
end
else
clean_sheet_if_need(options)
search_or_set_header(options)
headers = @headers ||
Hash[(first_column..last_column).map do |col|
[cell(@header_line, col), col]
end]
@header_line.upto(last_row) do |line|
yield(Hash[headers.map { |k, v| [k, cell(line, v)] }])
end
end
end
def parse(options = {})
results = each(options).map do |row|
block_given? ? yield(row) : row
end
options[:headers] == true ? results : results.drop(1)
end
def row_with(query, return_headers = false)
line_no = 0
each do |row|
line_no += 1
headers = query.map { |q| row.grep(q)[0] }.compact
if headers.length == query.length
@header_line = line_no
return return_headers ? headers : line_no
elsif line_no > 100
raise Roo::HeaderRowNotFoundError
end
end
raise Roo::HeaderRowNotFoundError
end
protected
def file_type_check(filename, exts, name, warning_level, packed = nil)
if packed == :zip
# spreadsheet.ods.zip => spreadsheet.ods
# Decompression is not performed here, only the 'zip' extension
# is removed from the file.
filename = File.basename(filename, File.extname(filename))
end
if uri?(filename) && (qs_begin = filename.rindex('?'))
filename = filename[0..qs_begin - 1]
end
exts = Array(exts)
return if exts.include?(File.extname(filename).downcase)
case warning_level
when :error
warn file_type_warning_message(filename, exts)
fail TypeError, "#{filename} is not #{name} file"
when :warning
warn "are you sure, this is #{name} spreadsheet file?"
warn file_type_warning_message(filename, exts)
when :ignore
# ignore
else
fail "#{warning_level} illegal state of file_warning"
end
end
# konvertiert einen Key in der Form "12,45" (=row,column) in
# ein Array mit numerischen Werten ([12,45])
# Diese Methode ist eine temp. Loesung, um zu erforschen, ob der
# Zugriff mit numerischen Keys schneller ist.
def key_to_num(str)
r, c = str.split(',')
[r.to_i, c.to_i]
end
# see: key_to_num
def key_to_string(arr)
"#{arr[0]},#{arr[1]}"
end
def is_stream?(filename_or_stream)
filename_or_stream.respond_to?(:seek)
end
private
def clean_sheet_if_need(options)
return unless options[:clean]
options.delete(:clean)
@cleaned ||= {}
clean_sheet(default_sheet) unless @cleaned[default_sheet]
end
def search_or_set_header(options)
if options[:header_search]
@headers = nil
@header_line = row_with(options[:header_search])
elsif [:first_row, true].include?(options[:headers])
@headers = []
row(first_row).each_with_index { |x, i| @headers << [x, i + 1] }
else
set_headers(options)
end
end
def local_filename(filename, tmpdir, packed)
return if is_stream?(filename)
filename = download_uri(filename, tmpdir) if uri?(filename)
filename = unzip(filename, tmpdir) if packed == :zip
fail IOError, "file #{filename} does not exist" unless File.file?(filename)
filename
end
def file_type_warning_message(filename, exts)
*rest, last_ext = exts
ext_list = rest.any? ? "#{rest.join(', ')} or #{last_ext}" : last_ext
"use #{Roo::CLASS_FOR_EXTENSION.fetch(last_ext.sub('.', '').to_sym)}.new to handle #{ext_list} spreadsheet files. This has #{File.extname(filename).downcase}"
rescue KeyError
raise "unknown file types: #{ext_list}"
end
def find_by_row(row_index)
row_index += (header_line - 1) if @header_line
row(row_index).size.times.map do |cell_index|
cell(row_index, cell_index + 1)
end
end
def find_by_conditions(options)
rows = first_row.upto(last_row)
header_for = Hash[1.upto(last_column).map do |col|
[col, cell(@header_line, col)]
end]
# are all conditions met?
conditions = options[:conditions]
if conditions && !conditions.empty?
column_with = header_for.invert
rows = rows.select do |i|
conditions.all? { |key, val| cell(i, column_with[key]) == val }
end
end
if options[:array]
rows.map { |i| row(i) }
else
rows.map do |i|
Hash[1.upto(row(i).size).map do |j|
[header_for.fetch(j), cell(i, j)]
end]
end
end
end
def without_changing_default_sheet
original_default_sheet = default_sheet
yield
ensure
self.default_sheet = original_default_sheet
end
def reinitialize
initialize(@filename)
end
def find_basename(filename)
if uri?(filename)
require 'uri'
uri = URI.parse filename
File.basename(uri.path)
elsif !is_stream?(filename)
File.basename(filename)
end
end
def make_tmpdir(prefix = nil, root = nil, &block)
warn '[DEPRECATION] extend Roo::Tempdir and use its .make_tempdir instead'
prefix = "#{Roo::TEMP_PREFIX}#{prefix}"
root ||= ENV['ROO_TMP']
if block_given?
# folder is deleted at end of block
::Dir.mktmpdir(prefix, root, &block)
else
self.class.make_tempdir(self, prefix, root)
end
end
def clean_sheet(sheet)
read_cells(sheet)
@cell[sheet].each_pair do |coord, value|
@cell[sheet][coord] = sanitize_value(value) if value.is_a?(::String)
end
@cleaned[sheet] = true
end
def sanitize_value(v)
v.gsub(/[[:cntrl:]]|^[\p{Space}]+|[\p{Space}]+$/, '')
end
def set_headers(hash = {})
# try to find header row with all values or give an error
# then create new hash by indexing strings and keeping integers for header array
@headers = row_with(hash.values, true)
@headers = Hash[hash.keys.zip(@headers.map { |x| header_index(x) })]
end
def header_index(query)
row(@header_line).index(query) + first_column
end
def set_value(row, col, value, sheet = default_sheet)
@cell[sheet][[row, col]] = value
end
def set_type(row, col, type, sheet = default_sheet)
@cell_type[sheet][[row, col]] = type
end
# converts cell coordinate to numeric values of row,col
def normalize(row, col)
if row.is_a?(::String)
if col.is_a?(::Integer)
# ('A',1):
# ('B', 5) -> (5, 2)
row, col = col, row
else
fail ArgumentError
end
end
col = ::Roo::Utils.letter_to_number(col) if col.is_a?(::String)
[row, col]
end
def uri?(filename)
filename.start_with?('http://', 'https://', 'ftp://')
rescue
false
end
def download_uri(uri, tmpdir)
require 'open-uri'
tempfilename = File.join(tmpdir, find_basename(uri))
begin
File.open(tempfilename, 'wb') do |file|
open(uri, 'User-Agent' => "Ruby/#{RUBY_VERSION}") do |net|
file.write(net.read)
end
end
rescue OpenURI::HTTPError
raise "could not open #{uri}"
end
tempfilename
end
def open_from_stream(stream, tmpdir)
tempfilename = File.join(tmpdir, 'spreadsheet')
File.open(tempfilename, 'wb') do |file|
file.write(stream[7..-1])
end
File.join(tmpdir, 'spreadsheet')
end
def unzip(filename, tmpdir)
require 'zip/filesystem'
Zip::File.open(filename) do |zip|
process_zipfile_packed(zip, tmpdir)
end
end
# check if default_sheet was set and exists in sheets-array
def validate_sheet!(sheet)
case sheet
when nil
fail ArgumentError, "Error: sheet 'nil' not valid"
when Integer
sheets.fetch(sheet - 1) do
fail RangeError, "sheet index #{sheet} not found"
end
when String
unless sheets.include?(sheet)
fail RangeError, "sheet '#{sheet}' not found"
end
else
fail TypeError, "not a valid sheet type: #{sheet.inspect}"
end
end
def process_zipfile_packed(zip, tmpdir, path = '')
if zip.file.file? path
# extract and return filename
File.open(File.join(tmpdir, path), 'wb') do |file|
file.write(zip.read(path))
end
File.join(tmpdir, path)
else
ret = nil
path += '/' unless path.empty?
zip.dir.foreach(path) do |filename|
ret = process_zipfile_packed(zip, tmpdir, path + filename)
end
ret
end
end
end
|