summaryrefslogtreecommitdiffstats
path: root/lib/roo/base.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/roo/base.rb')
-rw-r--r--lib/roo/base.rb598
1 files changed, 598 insertions, 0 deletions
diff --git a/lib/roo/base.rb b/lib/roo/base.rb
new file mode 100644
index 0000000..53e4075
--- /dev/null
+++ b/lib/roo/base.rb
@@ -0,0 +1,598 @@
+# encoding: utf-8
+
+require 'tmpdir'
+require 'stringio'
+require 'nokogiri'
+require 'roo/utils'
+require "roo/formatters/base"
+require "roo/formatters/csv"
+require "roo/formatters/matrix"
+require "roo/formatters/xml"
+require "roo/formatters/yaml"
+
+# Base class for all other types of spreadsheets
+class Roo::Base
+ include Enumerable
+ include Roo::Formatters::Base
+ include Roo::Formatters::CSV
+ include Roo::Formatters::Matrix
+ include Roo::Formatters::XML
+ include Roo::Formatters::YAML
+
+ MAX_ROW_COL = 999_999.freeze
+ MIN_ROW_COL = 0.freeze
+
+ attr_reader :headers
+
+ # sets the line with attribute names (default: 1)
+ attr_accessor :header_line
+
+ def self.TEMP_PREFIX
+ warn '[DEPRECATION] please access TEMP_PREFIX via Roo::TEMP_PREFIX'
+ Roo::TEMP_PREFIX
+ end
+
+ def self.finalize(object_id)
+ proc { finalize_tempdirs(object_id) }
+ end
+
+ def initialize(filename, options = {}, _file_warning = :error, _tmpdir = nil)
+ @filename = filename
+ @options = options
+
+ @cell = {}
+ @cell_type = {}
+ @cells_read = {}
+
+ @first_row = {}
+ @last_row = {}
+ @first_column = {}
+ @last_column = {}
+
+ @header_line = 1
+ end
+
+ def close
+ if self.class.respond_to?(:finalize_tempdirs)
+ self.class.finalize_tempdirs(object_id)
+ end
+ nil
+ end
+
+ def default_sheet
+ @default_sheet ||= sheets.first
+ end
+
+ # sets the working sheet in the document
+ # 'sheet' can be a number (1 = first sheet) or the name of a sheet.
+ def default_sheet=(sheet)
+ validate_sheet!(sheet)
+ @default_sheet = sheet
+ @first_row[sheet] = @last_row[sheet] = @first_column[sheet] = @last_column[sheet] = nil
+ @cells_read[sheet] = false
+ end
+
+ # first non-empty column as a letter
+ def first_column_as_letter(sheet = default_sheet)
+ ::Roo::Utils.number_to_letter(first_column(sheet))
+ end
+
+ # last non-empty column as a letter
+ def last_column_as_letter(sheet = default_sheet)
+ ::Roo::Utils.number_to_letter(last_column(sheet))
+ end
+
+ # Set first/last row/column for sheet
+ def first_last_row_col_for_sheet(sheet)
+ @first_last_row_cols ||= {}
+ @first_last_row_cols[sheet] ||= begin
+ result = collect_last_row_col_for_sheet(sheet)
+ {
+ first_row: result[:first_row] == MAX_ROW_COL ? nil : result[:first_row],
+ first_column: result[:first_column] == MAX_ROW_COL ? nil : result[:first_column],
+ last_row: result[:last_row] == MIN_ROW_COL ? nil : result[:last_row],
+ last_column: result[:last_column] == MIN_ROW_COL ? nil : result[:last_column]
+ }
+ end
+ end
+
+ # Collect first/last row/column from sheet
+ def collect_last_row_col_for_sheet(sheet)
+ first_row = first_column = MAX_ROW_COL
+ last_row = last_column = MIN_ROW_COL
+ @cell[sheet].each_pair do|key, value|
+ next unless value
+ first_row = [first_row, key.first.to_i].min
+ last_row = [last_row, key.first.to_i].max
+ first_column = [first_column, key.last.to_i].min
+ last_column = [last_column, key.last.to_i].max
+ end if @cell[sheet]
+ { first_row: first_row, first_column: first_column, last_row: last_row, last_column: last_column }
+ end
+
+ %w(first_row last_row first_column last_column).each do |key|
+ class_eval <<-EOS, __FILE__, __LINE__ + 1
+ def #{key}(sheet = default_sheet) # def first_row(sheet = default_sheet)
+ read_cells(sheet) # read_cells(sheet)
+ @#{key}[sheet] ||= first_last_row_col_for_sheet(sheet)[:#{key}] # @first_row[sheet] ||= first_last_row_col_for_sheet(sheet)[:first_row]
+ end # end
+ EOS
+ end
+
+ def inspect
+ "<##{self.class}:#{object_id.to_s(8)} #{instance_variables.join(' ')}>"
+ end
+
+ # find a row either by row number or a condition
+ # Caution: this works only within the default sheet -> set default_sheet before you call this method
+ # (experimental. see examples in the test_roo.rb file)
+ def find(*args) # :nodoc
+ options = (args.last.is_a?(Hash) ? args.pop : {})
+
+ case args[0]
+ when Integer
+ find_by_row(args[0])
+ when :all
+ find_by_conditions(options)
+ else
+ fail ArgumentError, "unexpected arg #{args[0].inspect}, pass a row index or :all"
+ end
+ end
+
+ # returns all values in this row as an array
+ # row numbers are 1,2,3,... like in the spreadsheet
+ def row(row_number, sheet = default_sheet)
+ read_cells(sheet)
+ first_column(sheet).upto(last_column(sheet)).map do |col|
+ cell(row_number, col, sheet)
+ end
+ end
+
+ # returns all values in this column as an array
+ # column numbers are 1,2,3,... like in the spreadsheet
+ def column(column_number, sheet = default_sheet)
+ if column_number.is_a?(::String)
+ column_number = ::Roo::Utils.letter_to_number(column_number)
+ end
+ read_cells(sheet)
+ first_row(sheet).upto(last_row(sheet)).map do |row|
+ cell(row, column_number, sheet)
+ end
+ end
+
+ # set a cell to a certain value
+ # (this will not be saved back to the spreadsheet file!)
+ def set(row, col, value, sheet = default_sheet) #:nodoc:
+ read_cells(sheet)
+ row, col = normalize(row, col)
+ cell_type = cell_type_by_value(value)
+ set_value(row, col, value, sheet)
+ set_type(row, col, cell_type, sheet)
+ end
+
+ def cell_type_by_value(value)
+ case value
+ when Integer then :float
+ when String, Float then :string
+ else
+ fail ArgumentError, "Type for #{value} not set"
+ end
+ end
+
+ # reopens and read a spreadsheet document
+ def reload
+ ds = default_sheet
+ reinitialize
+ self.default_sheet = ds
+ end
+
+ # true if cell is empty
+ def empty?(row, col, sheet = default_sheet)
+ read_cells(sheet)
+ row, col = normalize(row, col)
+ contents = cell(row, col, sheet)
+ !contents || (celltype(row, col, sheet) == :string && contents.empty?) \
+ || (row < first_row(sheet) || row > last_row(sheet) || col < first_column(sheet) || col > last_column(sheet))
+ end
+
+ # returns information of the spreadsheet document and all sheets within
+ # this document.
+ def info
+ without_changing_default_sheet do
+ result = "File: #{File.basename(@filename)}\n"\
+ "Number of sheets: #{sheets.size}\n"\
+ "Sheets: #{sheets.join(', ')}\n"
+ n = 1
+ sheets.each do|sheet|
+ self.default_sheet = sheet
+ result << 'Sheet ' + n.to_s + ":\n"
+ if first_row
+ result << " First row: #{first_row}\n"
+ result << " Last row: #{last_row}\n"
+ result << " First column: #{::Roo::Utils.number_to_letter(first_column)}\n"
+ result << " Last column: #{::Roo::Utils.number_to_letter(last_column)}"
+ else
+ result << ' - empty -'
+ end
+ result << "\n" if sheet != sheets.last
+ n += 1
+ end
+ result
+ end
+ end
+
+ # when a method like spreadsheet.a42 is called
+ # convert it to a call of spreadsheet.cell('a',42)
+ def method_missing(m, *args)
+ # #aa42 => #cell('aa',42)
+ # #aa42('Sheet1') => #cell('aa',42,'Sheet1')
+ if m =~ /^([a-z]+)(\d+)$/
+ col = ::Roo::Utils.letter_to_number(Regexp.last_match[1])
+ row = Regexp.last_match[2].to_i
+ if args.empty?
+ cell(row, col)
+ else
+ cell(row, col, args.first)
+ end
+ else
+ super
+ end
+ end
+
+ # access different worksheets by calling spreadsheet.sheet(1)
+ # or spreadsheet.sheet('SHEETNAME')
+ def sheet(index, name = false)
+ self.default_sheet = index.is_a?(::String) ? index : sheets[index]
+ name ? [default_sheet, self] : self
+ end
+
+ # iterate through all worksheets of a document
+ def each_with_pagename
+ sheets.each do |s|
+ yield sheet(s, true)
+ end
+ end
+
+ # by passing in headers as options, this method returns
+ # specific columns from your header assignment
+ # for example:
+ # xls.sheet('New Prices').parse(:upc => 'UPC', :price => 'Price') would return:
+ # [{:upc => 123456789012, :price => 35.42},..]
+
+ # the queries are matched with regex, so regex options can be passed in
+ # such as :price => '^(Cost|Price)'
+ # case insensitive by default
+
+ # by using the :header_search option, you can query for headers
+ # and return a hash of every row with the keys set to the header result
+ # for example:
+ # xls.sheet('New Prices').parse(:header_search => ['UPC*SKU','^Price*\sCost\s'])
+
+ # that example searches for a column titled either UPC or SKU and another
+ # column titled either Price or Cost (regex characters allowed)
+ # * is the wildcard character
+
+ # you can also pass in a :clean => true option to strip the sheet of
+ # control characters and white spaces around columns
+
+ def each(options = {})
+ return to_enum(:each, options) unless block_given?
+
+ if options.empty?
+ 1.upto(last_row) do |line|
+ yield row(line)
+ end
+ else
+ clean_sheet_if_need(options)
+ search_or_set_header(options)
+ headers = @headers ||
+ Hash[(first_column..last_column).map do |col|
+ [cell(@header_line, col), col]
+ end]
+
+ @header_line.upto(last_row) do |line|
+ yield(Hash[headers.map { |k, v| [k, cell(line, v)] }])
+ end
+ end
+ end
+
+ def parse(options = {})
+ results = each(options).map do |row|
+ block_given? ? yield(row) : row
+ end
+
+ options[:headers] == true ? results : results.drop(1)
+ end
+
+ def row_with(query, return_headers = false)
+ line_no = 0
+ each do |row|
+ line_no += 1
+ headers = query.map { |q| row.grep(q)[0] }.compact
+
+ if headers.length == query.length
+ @header_line = line_no
+ return return_headers ? headers : line_no
+ elsif line_no > 100
+ raise Roo::HeaderRowNotFoundError
+ end
+ end
+ raise Roo::HeaderRowNotFoundError
+ end
+
+ protected
+
+ def file_type_check(filename, exts, name, warning_level, packed = nil)
+ if packed == :zip
+ # spreadsheet.ods.zip => spreadsheet.ods
+ # Decompression is not performed here, only the 'zip' extension
+ # is removed from the file.
+ filename = File.basename(filename, File.extname(filename))
+ end
+
+ if uri?(filename) && (qs_begin = filename.rindex('?'))
+ filename = filename[0..qs_begin - 1]
+ end
+ exts = Array(exts)
+
+ return if exts.include?(File.extname(filename).downcase)
+
+ case warning_level
+ when :error
+ warn file_type_warning_message(filename, exts)
+ fail TypeError, "#{filename} is not #{name} file"
+ when :warning
+ warn "are you sure, this is #{name} spreadsheet file?"
+ warn file_type_warning_message(filename, exts)
+ when :ignore
+ # ignore
+ else
+ fail "#{warning_level} illegal state of file_warning"
+ end
+ end
+
+ # konvertiert einen Key in der Form "12,45" (=row,column) in
+ # ein Array mit numerischen Werten ([12,45])
+ # Diese Methode ist eine temp. Loesung, um zu erforschen, ob der
+ # Zugriff mit numerischen Keys schneller ist.
+ def key_to_num(str)
+ r, c = str.split(',')
+ [r.to_i, c.to_i]
+ end
+
+ # see: key_to_num
+ def key_to_string(arr)
+ "#{arr[0]},#{arr[1]}"
+ end
+
+ def is_stream?(filename_or_stream)
+ filename_or_stream.respond_to?(:seek)
+ end
+
+ private
+
+ def clean_sheet_if_need(options)
+ return unless options[:clean]
+ options.delete(:clean)
+ @cleaned ||= {}
+ clean_sheet(default_sheet) unless @cleaned[default_sheet]
+ end
+
+ def search_or_set_header(options)
+ if options[:header_search]
+ @headers = nil
+ @header_line = row_with(options[:header_search])
+ elsif [:first_row, true].include?(options[:headers])
+ @headers = []
+ row(first_row).each_with_index { |x, i| @headers << [x, i + 1] }
+ else
+ set_headers(options)
+ end
+ end
+
+ def local_filename(filename, tmpdir, packed)
+ return if is_stream?(filename)
+ filename = download_uri(filename, tmpdir) if uri?(filename)
+ filename = unzip(filename, tmpdir) if packed == :zip
+
+ fail IOError, "file #{filename} does not exist" unless File.file?(filename)
+
+ filename
+ end
+
+ def file_type_warning_message(filename, exts)
+ *rest, last_ext = exts
+ ext_list = rest.any? ? "#{rest.join(', ')} or #{last_ext}" : last_ext
+ "use #{Roo::CLASS_FOR_EXTENSION.fetch(last_ext.sub('.', '').to_sym)}.new to handle #{ext_list} spreadsheet files. This has #{File.extname(filename).downcase}"
+ rescue KeyError
+ raise "unknown file types: #{ext_list}"
+ end
+
+ def find_by_row(row_index)
+ row_index += (header_line - 1) if @header_line
+
+ row(row_index).size.times.map do |cell_index|
+ cell(row_index, cell_index + 1)
+ end
+ end
+
+ def find_by_conditions(options)
+ rows = first_row.upto(last_row)
+ header_for = Hash[1.upto(last_column).map do |col|
+ [col, cell(@header_line, col)]
+ end]
+
+ # are all conditions met?
+ conditions = options[:conditions]
+ if conditions && !conditions.empty?
+ column_with = header_for.invert
+ rows = rows.select do |i|
+ conditions.all? { |key, val| cell(i, column_with[key]) == val }
+ end
+ end
+
+ if options[:array]
+ rows.map { |i| row(i) }
+ else
+ rows.map do |i|
+ Hash[1.upto(row(i).size).map do |j|
+ [header_for.fetch(j), cell(i, j)]
+ end]
+ end
+ end
+ end
+
+ def without_changing_default_sheet
+ original_default_sheet = default_sheet
+ yield
+ ensure
+ self.default_sheet = original_default_sheet
+ end
+
+ def reinitialize
+ initialize(@filename)
+ end
+
+ def find_basename(filename)
+ if uri?(filename)
+ require 'uri'
+ uri = URI.parse filename
+ File.basename(uri.path)
+ elsif !is_stream?(filename)
+ File.basename(filename)
+ end
+ end
+
+ def make_tmpdir(prefix = nil, root = nil, &block)
+ warn '[DEPRECATION] extend Roo::Tempdir and use its .make_tempdir instead'
+ prefix = "#{Roo::TEMP_PREFIX}#{prefix}"
+ root ||= ENV['ROO_TMP']
+
+ if block_given?
+ # folder is deleted at end of block
+ ::Dir.mktmpdir(prefix, root, &block)
+ else
+ self.class.make_tempdir(self, prefix, root)
+ end
+ end
+
+ def clean_sheet(sheet)
+ read_cells(sheet)
+ @cell[sheet].each_pair do |coord, value|
+ @cell[sheet][coord] = sanitize_value(value) if value.is_a?(::String)
+ end
+ @cleaned[sheet] = true
+ end
+
+ def sanitize_value(v)
+ v.gsub(/[[:cntrl:]]|^[\p{Space}]+|[\p{Space}]+$/, '')
+ end
+
+ def set_headers(hash = {})
+ # try to find header row with all values or give an error
+ # then create new hash by indexing strings and keeping integers for header array
+ @headers = row_with(hash.values, true)
+ @headers = Hash[hash.keys.zip(@headers.map { |x| header_index(x) })]
+ end
+
+ def header_index(query)
+ row(@header_line).index(query) + first_column
+ end
+
+ def set_value(row, col, value, sheet = default_sheet)
+ @cell[sheet][[row, col]] = value
+ end
+
+ def set_type(row, col, type, sheet = default_sheet)
+ @cell_type[sheet][[row, col]] = type
+ end
+
+ # converts cell coordinate to numeric values of row,col
+ def normalize(row, col)
+ if row.is_a?(::String)
+ if col.is_a?(::Integer)
+ # ('A',1):
+ # ('B', 5) -> (5, 2)
+ row, col = col, row
+ else
+ fail ArgumentError
+ end
+ end
+
+ col = ::Roo::Utils.letter_to_number(col) if col.is_a?(::String)
+
+ [row, col]
+ end
+
+ def uri?(filename)
+ filename.start_with?('http://', 'https://', 'ftp://')
+ rescue
+ false
+ end
+
+ def download_uri(uri, tmpdir)
+ require 'open-uri'
+ tempfilename = File.join(tmpdir, find_basename(uri))
+ begin
+ File.open(tempfilename, 'wb') do |file|
+ open(uri, 'User-Agent' => "Ruby/#{RUBY_VERSION}") do |net|
+ file.write(net.read)
+ end
+ end
+ rescue OpenURI::HTTPError
+ raise "could not open #{uri}"
+ end
+ tempfilename
+ end
+
+ def open_from_stream(stream, tmpdir)
+ tempfilename = File.join(tmpdir, 'spreadsheet')
+ File.open(tempfilename, 'wb') do |file|
+ file.write(stream[7..-1])
+ end
+ File.join(tmpdir, 'spreadsheet')
+ end
+
+ def unzip(filename, tmpdir)
+ require 'zip/filesystem'
+
+ Zip::File.open(filename) do |zip|
+ process_zipfile_packed(zip, tmpdir)
+ end
+ end
+
+ # check if default_sheet was set and exists in sheets-array
+ def validate_sheet!(sheet)
+ case sheet
+ when nil
+ fail ArgumentError, "Error: sheet 'nil' not valid"
+ when Integer
+ sheets.fetch(sheet - 1) do
+ fail RangeError, "sheet index #{sheet} not found"
+ end
+ when String
+ unless sheets.include?(sheet)
+ fail RangeError, "sheet '#{sheet}' not found"
+ end
+ else
+ fail TypeError, "not a valid sheet type: #{sheet.inspect}"
+ end
+ end
+
+ def process_zipfile_packed(zip, tmpdir, path = '')
+ if zip.file.file? path
+ # extract and return filename
+ File.open(File.join(tmpdir, path), 'wb') do |file|
+ file.write(zip.read(path))
+ end
+ File.join(tmpdir, path)
+ else
+ ret = nil
+ path += '/' unless path.empty?
+ zip.dir.foreach(path) do |filename|
+ ret = process_zipfile_packed(zip, tmpdir, path + filename)
+ end
+ ret
+ end
+ end
+end