diff options
| author | 2017-06-12 03:37:11 -0400 | |
|---|---|---|
| committer | 2017-06-12 03:37:11 -0400 | |
| commit | 8280a21a23d44aa90177e2bc041d0b8dc8556f4b (patch) | |
| tree | dadef7ee085c0e990a5070bd41b6a5b98c97f4fd /lib | |
Import Upstream version 2.7.1upstream/2.7.1
Diffstat (limited to 'lib')
39 files changed, 3577 insertions, 0 deletions
diff --git a/lib/roo.rb b/lib/roo.rb new file mode 100644 index 0000000..ec8eb42 --- /dev/null +++ b/lib/roo.rb @@ -0,0 +1,33 @@ +require 'roo/constants' +require 'roo/errors' +require 'roo/spreadsheet' +require 'roo/base' + +module Roo + autoload :OpenOffice, 'roo/open_office' + autoload :LibreOffice, 'roo/libre_office' + autoload :Excelx, 'roo/excelx' + autoload :CSV, 'roo/csv' + + TEMP_PREFIX = 'roo_'.freeze + + CLASS_FOR_EXTENSION = { + ods: Roo::OpenOffice, + xlsx: Roo::Excelx, + xlsm: Roo::Excelx, + csv: Roo::CSV + } + + def self.const_missing(const_name) + case const_name + when :Excel + raise ROO_EXCEL_NOTICE + when :Excel2003XML + raise ROO_EXCELML_NOTICE + when :Google + raise ROO_GOOGLE_NOTICE + else + super + end + end +end diff --git a/lib/roo/base.rb b/lib/roo/base.rb new file mode 100644 index 0000000..53e4075 --- /dev/null +++ b/lib/roo/base.rb @@ -0,0 +1,598 @@ +# encoding: utf-8 + +require 'tmpdir' +require 'stringio' +require 'nokogiri' +require 'roo/utils' +require "roo/formatters/base" +require "roo/formatters/csv" +require "roo/formatters/matrix" +require "roo/formatters/xml" +require "roo/formatters/yaml" + +# Base class for all other types of spreadsheets +class Roo::Base + include Enumerable + include Roo::Formatters::Base + include Roo::Formatters::CSV + include Roo::Formatters::Matrix + include Roo::Formatters::XML + include Roo::Formatters::YAML + + MAX_ROW_COL = 999_999.freeze + MIN_ROW_COL = 0.freeze + + attr_reader :headers + + # sets the line with attribute names (default: 1) + attr_accessor :header_line + + def self.TEMP_PREFIX + warn '[DEPRECATION] please access TEMP_PREFIX via Roo::TEMP_PREFIX' + Roo::TEMP_PREFIX + end + + def self.finalize(object_id) + proc { finalize_tempdirs(object_id) } + end + + def initialize(filename, options = {}, _file_warning = :error, _tmpdir = nil) + @filename = filename + @options = options + + @cell = {} + @cell_type = {} + @cells_read = {} + + @first_row = {} + @last_row = {} + @first_column = {} + @last_column = {} + + @header_line = 1 + end + + def close + if self.class.respond_to?(:finalize_tempdirs) + self.class.finalize_tempdirs(object_id) + end + nil + end + + def default_sheet + @default_sheet ||= sheets.first + end + + # sets the working sheet in the document + # 'sheet' can be a number (1 = first sheet) or the name of a sheet. + def default_sheet=(sheet) + validate_sheet!(sheet) + @default_sheet = sheet + @first_row[sheet] = @last_row[sheet] = @first_column[sheet] = @last_column[sheet] = nil + @cells_read[sheet] = false + end + + # first non-empty column as a letter + def first_column_as_letter(sheet = default_sheet) + ::Roo::Utils.number_to_letter(first_column(sheet)) + end + + # last non-empty column as a letter + def last_column_as_letter(sheet = default_sheet) + ::Roo::Utils.number_to_letter(last_column(sheet)) + end + + # Set first/last row/column for sheet + def first_last_row_col_for_sheet(sheet) + @first_last_row_cols ||= {} + @first_last_row_cols[sheet] ||= begin + result = collect_last_row_col_for_sheet(sheet) + { + first_row: result[:first_row] == MAX_ROW_COL ? nil : result[:first_row], + first_column: result[:first_column] == MAX_ROW_COL ? nil : result[:first_column], + last_row: result[:last_row] == MIN_ROW_COL ? nil : result[:last_row], + last_column: result[:last_column] == MIN_ROW_COL ? nil : result[:last_column] + } + end + end + + # Collect first/last row/column from sheet + def collect_last_row_col_for_sheet(sheet) + first_row = first_column = MAX_ROW_COL + last_row = last_column = MIN_ROW_COL + @cell[sheet].each_pair do|key, value| + next unless value + first_row = [first_row, key.first.to_i].min + last_row = [last_row, key.first.to_i].max + first_column = [first_column, key.last.to_i].min + last_column = [last_column, key.last.to_i].max + end if @cell[sheet] + { first_row: first_row, first_column: first_column, last_row: last_row, last_column: last_column } + end + + %w(first_row last_row first_column last_column).each do |key| + class_eval <<-EOS, __FILE__, __LINE__ + 1 + def #{key}(sheet = default_sheet) # def first_row(sheet = default_sheet) + read_cells(sheet) # read_cells(sheet) + @#{key}[sheet] ||= first_last_row_col_for_sheet(sheet)[:#{key}] # @first_row[sheet] ||= first_last_row_col_for_sheet(sheet)[:first_row] + end # end + EOS + end + + def inspect + "<##{self.class}:#{object_id.to_s(8)} #{instance_variables.join(' ')}>" + end + + # find a row either by row number or a condition + # Caution: this works only within the default sheet -> set default_sheet before you call this method + # (experimental. see examples in the test_roo.rb file) + def find(*args) # :nodoc + options = (args.last.is_a?(Hash) ? args.pop : {}) + + case args[0] + when Integer + find_by_row(args[0]) + when :all + find_by_conditions(options) + else + fail ArgumentError, "unexpected arg #{args[0].inspect}, pass a row index or :all" + end + end + + # returns all values in this row as an array + # row numbers are 1,2,3,... like in the spreadsheet + def row(row_number, sheet = default_sheet) + read_cells(sheet) + first_column(sheet).upto(last_column(sheet)).map do |col| + cell(row_number, col, sheet) + end + end + + # returns all values in this column as an array + # column numbers are 1,2,3,... like in the spreadsheet + def column(column_number, sheet = default_sheet) + if column_number.is_a?(::String) + column_number = ::Roo::Utils.letter_to_number(column_number) + end + read_cells(sheet) + first_row(sheet).upto(last_row(sheet)).map do |row| + cell(row, column_number, sheet) + end + end + + # set a cell to a certain value + # (this will not be saved back to the spreadsheet file!) + def set(row, col, value, sheet = default_sheet) #:nodoc: + read_cells(sheet) + row, col = normalize(row, col) + cell_type = cell_type_by_value(value) + set_value(row, col, value, sheet) + set_type(row, col, cell_type, sheet) + end + + def cell_type_by_value(value) + case value + when Integer then :float + when String, Float then :string + else + fail ArgumentError, "Type for #{value} not set" + end + end + + # reopens and read a spreadsheet document + def reload + ds = default_sheet + reinitialize + self.default_sheet = ds + end + + # true if cell is empty + def empty?(row, col, sheet = default_sheet) + read_cells(sheet) + row, col = normalize(row, col) + contents = cell(row, col, sheet) + !contents || (celltype(row, col, sheet) == :string && contents.empty?) \ + || (row < first_row(sheet) || row > last_row(sheet) || col < first_column(sheet) || col > last_column(sheet)) + end + + # returns information of the spreadsheet document and all sheets within + # this document. + def info + without_changing_default_sheet do + result = "File: #{File.basename(@filename)}\n"\ + "Number of sheets: #{sheets.size}\n"\ + "Sheets: #{sheets.join(', ')}\n" + n = 1 + sheets.each do|sheet| + self.default_sheet = sheet + result << 'Sheet ' + n.to_s + ":\n" + if first_row + result << " First row: #{first_row}\n" + result << " Last row: #{last_row}\n" + result << " First column: #{::Roo::Utils.number_to_letter(first_column)}\n" + result << " Last column: #{::Roo::Utils.number_to_letter(last_column)}" + else + result << ' - empty -' + end + result << "\n" if sheet != sheets.last + n += 1 + end + result + end + end + + # when a method like spreadsheet.a42 is called + # convert it to a call of spreadsheet.cell('a',42) + def method_missing(m, *args) + # #aa42 => #cell('aa',42) + # #aa42('Sheet1') => #cell('aa',42,'Sheet1') + if m =~ /^([a-z]+)(\d+)$/ + col = ::Roo::Utils.letter_to_number(Regexp.last_match[1]) + row = Regexp.last_match[2].to_i + if args.empty? + cell(row, col) + else + cell(row, col, args.first) + end + else + super + end + end + + # access different worksheets by calling spreadsheet.sheet(1) + # or spreadsheet.sheet('SHEETNAME') + def sheet(index, name = false) + self.default_sheet = index.is_a?(::String) ? index : sheets[index] + name ? [default_sheet, self] : self + end + + # iterate through all worksheets of a document + def each_with_pagename + sheets.each do |s| + yield sheet(s, true) + end + end + + # by passing in headers as options, this method returns + # specific columns from your header assignment + # for example: + # xls.sheet('New Prices').parse(:upc => 'UPC', :price => 'Price') would return: + # [{:upc => 123456789012, :price => 35.42},..] + + # the queries are matched with regex, so regex options can be passed in + # such as :price => '^(Cost|Price)' + # case insensitive by default + + # by using the :header_search option, you can query for headers + # and return a hash of every row with the keys set to the header result + # for example: + # xls.sheet('New Prices').parse(:header_search => ['UPC*SKU','^Price*\sCost\s']) + + # that example searches for a column titled either UPC or SKU and another + # column titled either Price or Cost (regex characters allowed) + # * is the wildcard character + + # you can also pass in a :clean => true option to strip the sheet of + # control characters and white spaces around columns + + def each(options = {}) + return to_enum(:each, options) unless block_given? + + if options.empty? + 1.upto(last_row) do |line| + yield row(line) + end + else + clean_sheet_if_need(options) + search_or_set_header(options) + headers = @headers || + Hash[(first_column..last_column).map do |col| + [cell(@header_line, col), col] + end] + + @header_line.upto(last_row) do |line| + yield(Hash[headers.map { |k, v| [k, cell(line, v)] }]) + end + end + end + + def parse(options = {}) + results = each(options).map do |row| + block_given? ? yield(row) : row + end + + options[:headers] == true ? results : results.drop(1) + end + + def row_with(query, return_headers = false) + line_no = 0 + each do |row| + line_no += 1 + headers = query.map { |q| row.grep(q)[0] }.compact + + if headers.length == query.length + @header_line = line_no + return return_headers ? headers : line_no + elsif line_no > 100 + raise Roo::HeaderRowNotFoundError + end + end + raise Roo::HeaderRowNotFoundError + end + + protected + + def file_type_check(filename, exts, name, warning_level, packed = nil) + if packed == :zip + # spreadsheet.ods.zip => spreadsheet.ods + # Decompression is not performed here, only the 'zip' extension + # is removed from the file. + filename = File.basename(filename, File.extname(filename)) + end + + if uri?(filename) && (qs_begin = filename.rindex('?')) + filename = filename[0..qs_begin - 1] + end + exts = Array(exts) + + return if exts.include?(File.extname(filename).downcase) + + case warning_level + when :error + warn file_type_warning_message(filename, exts) + fail TypeError, "#{filename} is not #{name} file" + when :warning + warn "are you sure, this is #{name} spreadsheet file?" + warn file_type_warning_message(filename, exts) + when :ignore + # ignore + else + fail "#{warning_level} illegal state of file_warning" + end + end + + # konvertiert einen Key in der Form "12,45" (=row,column) in + # ein Array mit numerischen Werten ([12,45]) + # Diese Methode ist eine temp. Loesung, um zu erforschen, ob der + # Zugriff mit numerischen Keys schneller ist. + def key_to_num(str) + r, c = str.split(',') + [r.to_i, c.to_i] + end + + # see: key_to_num + def key_to_string(arr) + "#{arr[0]},#{arr[1]}" + end + + def is_stream?(filename_or_stream) + filename_or_stream.respond_to?(:seek) + end + + private + + def clean_sheet_if_need(options) + return unless options[:clean] + options.delete(:clean) + @cleaned ||= {} + clean_sheet(default_sheet) unless @cleaned[default_sheet] + end + + def search_or_set_header(options) + if options[:header_search] + @headers = nil + @header_line = row_with(options[:header_search]) + elsif [:first_row, true].include?(options[:headers]) + @headers = [] + row(first_row).each_with_index { |x, i| @headers << [x, i + 1] } + else + set_headers(options) + end + end + + def local_filename(filename, tmpdir, packed) + return if is_stream?(filename) + filename = download_uri(filename, tmpdir) if uri?(filename) + filename = unzip(filename, tmpdir) if packed == :zip + + fail IOError, "file #{filename} does not exist" unless File.file?(filename) + + filename + end + + def file_type_warning_message(filename, exts) + *rest, last_ext = exts + ext_list = rest.any? ? "#{rest.join(', ')} or #{last_ext}" : last_ext + "use #{Roo::CLASS_FOR_EXTENSION.fetch(last_ext.sub('.', '').to_sym)}.new to handle #{ext_list} spreadsheet files. This has #{File.extname(filename).downcase}" + rescue KeyError + raise "unknown file types: #{ext_list}" + end + + def find_by_row(row_index) + row_index += (header_line - 1) if @header_line + + row(row_index).size.times.map do |cell_index| + cell(row_index, cell_index + 1) + end + end + + def find_by_conditions(options) + rows = first_row.upto(last_row) + header_for = Hash[1.upto(last_column).map do |col| + [col, cell(@header_line, col)] + end] + + # are all conditions met? + conditions = options[:conditions] + if conditions && !conditions.empty? + column_with = header_for.invert + rows = rows.select do |i| + conditions.all? { |key, val| cell(i, column_with[key]) == val } + end + end + + if options[:array] + rows.map { |i| row(i) } + else + rows.map do |i| + Hash[1.upto(row(i).size).map do |j| + [header_for.fetch(j), cell(i, j)] + end] + end + end + end + + def without_changing_default_sheet + original_default_sheet = default_sheet + yield + ensure + self.default_sheet = original_default_sheet + end + + def reinitialize + initialize(@filename) + end + + def find_basename(filename) + if uri?(filename) + require 'uri' + uri = URI.parse filename + File.basename(uri.path) + elsif !is_stream?(filename) + File.basename(filename) + end + end + + def make_tmpdir(prefix = nil, root = nil, &block) + warn '[DEPRECATION] extend Roo::Tempdir and use its .make_tempdir instead' + prefix = "#{Roo::TEMP_PREFIX}#{prefix}" + root ||= ENV['ROO_TMP'] + + if block_given? + # folder is deleted at end of block + ::Dir.mktmpdir(prefix, root, &block) + else + self.class.make_tempdir(self, prefix, root) + end + end + + def clean_sheet(sheet) + read_cells(sheet) + @cell[sheet].each_pair do |coord, value| + @cell[sheet][coord] = sanitize_value(value) if value.is_a?(::String) + end + @cleaned[sheet] = true + end + + def sanitize_value(v) + v.gsub(/[[:cntrl:]]|^[\p{Space}]+|[\p{Space}]+$/, '') + end + + def set_headers(hash = {}) + # try to find header row with all values or give an error + # then create new hash by indexing strings and keeping integers for header array + @headers = row_with(hash.values, true) + @headers = Hash[hash.keys.zip(@headers.map { |x| header_index(x) })] + end + + def header_index(query) + row(@header_line).index(query) + first_column + end + + def set_value(row, col, value, sheet = default_sheet) + @cell[sheet][[row, col]] = value + end + + def set_type(row, col, type, sheet = default_sheet) + @cell_type[sheet][[row, col]] = type + end + + # converts cell coordinate to numeric values of row,col + def normalize(row, col) + if row.is_a?(::String) + if col.is_a?(::Integer) + # ('A',1): + # ('B', 5) -> (5, 2) + row, col = col, row + else + fail ArgumentError + end + end + + col = ::Roo::Utils.letter_to_number(col) if col.is_a?(::String) + + [row, col] + end + + def uri?(filename) + filename.start_with?('http://', 'https://', 'ftp://') + rescue + false + end + + def download_uri(uri, tmpdir) + require 'open-uri' + tempfilename = File.join(tmpdir, find_basename(uri)) + begin + File.open(tempfilename, 'wb') do |file| + open(uri, 'User-Agent' => "Ruby/#{RUBY_VERSION}") do |net| + file.write(net.read) + end + end + rescue OpenURI::HTTPError + raise "could not open #{uri}" + end + tempfilename + end + + def open_from_stream(stream, tmpdir) + tempfilename = File.join(tmpdir, 'spreadsheet') + File.open(tempfilename, 'wb') do |file| + file.write(stream[7..-1]) + end + File.join(tmpdir, 'spreadsheet') + end + + def unzip(filename, tmpdir) + require 'zip/filesystem' + + Zip::File.open(filename) do |zip| + process_zipfile_packed(zip, tmpdir) + end + end + + # check if default_sheet was set and exists in sheets-array + def validate_sheet!(sheet) + case sheet + when nil + fail ArgumentError, "Error: sheet 'nil' not valid" + when Integer + sheets.fetch(sheet - 1) do + fail RangeError, "sheet index #{sheet} not found" + end + when String + unless sheets.include?(sheet) + fail RangeError, "sheet '#{sheet}' not found" + end + else + fail TypeError, "not a valid sheet type: #{sheet.inspect}" + end + end + + def process_zipfile_packed(zip, tmpdir, path = '') + if zip.file.file? path + # extract and return filename + File.open(File.join(tmpdir, path), 'wb') do |file| + file.write(zip.read(path)) + end + File.join(tmpdir, path) + else + ret = nil + path += '/' unless path.empty? + zip.dir.foreach(path) do |filename| + ret = process_zipfile_packed(zip, tmpdir, path + filename) + end + ret + end + end +end diff --git a/lib/roo/constants.rb b/lib/roo/constants.rb new file mode 100644 index 0000000..90d54ee --- /dev/null +++ b/lib/roo/constants.rb @@ -0,0 +1,5 @@ +module Roo + ROO_EXCEL_NOTICE = "Excel support has been extracted to roo-xls due to its dependency on the GPL'd spreadsheet gem. Install roo-xls to use Roo::Excel.".freeze + ROO_EXCELML_NOTICE = "Excel SpreadsheetML support has been extracted to roo-xls. Install roo-xls to use Roo::Excel2003XML.".freeze + ROO_GOOGLE_NOTICE = "Google support has been extracted to roo-google. Install roo-google to use Roo::Google.".freeze +end
\ No newline at end of file diff --git a/lib/roo/csv.rb b/lib/roo/csv.rb new file mode 100644 index 0000000..c161c64 --- /dev/null +++ b/lib/roo/csv.rb @@ -0,0 +1,127 @@ +require "csv" +require "time" + +# The CSV class can read csv files (must be separated with commas) which then +# can be handled like spreadsheets. This means you can access cells like A5 +# within these files. +# The CSV class provides only string objects. If you want conversions to other +# types you have to do it yourself. +# +# You can pass options to the underlying CSV parse operation, via the +# :csv_options option. +module Roo + class CSV < Roo::Base + attr_reader :filename + + # Returns an array with the names of the sheets. In CSV class there is only + # one dummy sheet, because a csv file cannot have more than one sheet. + def sheets + ["default"] + end + + def cell(row, col, sheet = nil) + sheet ||= default_sheet + read_cells(sheet) + @cell[normalize(row, col)] + end + + def celltype(row, col, sheet = nil) + sheet ||= default_sheet + read_cells(sheet) + @cell_type[normalize(row, col)] + end + + def cell_postprocessing(_row, _col, value) + value + end + + def csv_options + @options[:csv_options] || {} + end + + def set_value(row, col, value, _sheet) + @cell[[row, col]] = value + end + + def set_type(row, col, type, _sheet) + @cell_type[[row, col]] = type + end + + private + + TYPE_MAP = { + String => :string, + Float => :float, + Date => :date, + DateTime => :datetime, + } + + def celltype_class(value) + TYPE_MAP[value.class] + end + + def read_cells(sheet = default_sheet) + sheet ||= default_sheet + return if @cells_read[sheet] + set_row_count(sheet) + set_column_count(sheet) + row_num = 1 + + each_row csv_options do |row| + row.each_with_index do |elem, col_num| + coordinate = [row_num, col_num + 1] + @cell[coordinate] = elem + @cell_type[coordinate] = celltype_class(elem) + end + row_num += 1 + end + + @cells_read[sheet] = true + end + + def each_row(options, &block) + if uri?(filename) + each_row_using_temp_dir(filename) + elsif is_stream?(filename_or_stream) + ::CSV.new(filename_or_stream, options).each(&block) + else + ::CSV.foreach(filename, options, &block) + end + end + + def each_row_using_tempdir + ::Dir.mktmpdir(Roo::TEMP_PREFIX, ENV["ROO_TMP"]) do |tmpdir| + tmp_filename = download_uri(filename, tmpdir) + ::CSV.foreach(tmp_filename, options, &block) + end + end + + def set_row_count(sheet) + @first_row[sheet] = 1 + @last_row[sheet] = ::CSV.readlines(@filename, csv_options).size + @last_row[sheet] = @first_row[sheet] if @last_row[sheet].zero? + + nil + end + + def set_column_count(sheet) + @first_column[sheet] = 1 + @last_column[sheet] = (::CSV.readlines(@filename, csv_options).first || []).size + @last_column[sheet] = @first_column[sheet] if @last_column[sheet].zero? + + nil + end + + def clean_sheet(sheet) + read_cells(sheet) + + @cell.each_pair do |coord, value| + @cell[coord] = sanitize_value(value) if value.is_a?(::String) + end + + @cleaned[sheet] = true + end + + alias_method :filename_or_stream, :filename + end +end diff --git a/lib/roo/errors.rb b/lib/roo/errors.rb new file mode 100644 index 0000000..9a736b8 --- /dev/null +++ b/lib/roo/errors.rb @@ -0,0 +1,11 @@ +module Roo + # A base error class for Roo. Most errors thrown by Roo should inherit from + # this class. + class Error < StandardError; end + + # Raised when Roo cannot find a header row that matches the given column + # name(s). + class HeaderRowNotFoundError < Error; end + + class FileNotFound < Error; end +end diff --git a/lib/roo/excelx.rb b/lib/roo/excelx.rb new file mode 100644 index 0000000..82c1431 --- /dev/null +++ b/lib/roo/excelx.rb @@ -0,0 +1,448 @@ +require 'nokogiri' +require 'zip/filesystem' +require 'roo/link' +require 'roo/tempdir' +require 'roo/utils' +require 'forwardable' +require 'set' + +module Roo + class Excelx < Roo::Base + extend Roo::Tempdir + extend Forwardable + + ERROR_VALUES = %w(#N/A #REF! #NAME? #DIV/0! #NULL! #VALUE! #NUM!).to_set + + require 'roo/excelx/shared' + require 'roo/excelx/workbook' + require 'roo/excelx/shared_strings' + require 'roo/excelx/styles' + require 'roo/excelx/cell' + require 'roo/excelx/sheet' + require 'roo/excelx/relationships' + require 'roo/excelx/comments' + require 'roo/excelx/sheet_doc' + require 'roo/excelx/coordinate' + require 'roo/excelx/format' + + delegate [:styles, :workbook, :shared_strings, :rels_files, :sheet_files, :comments_files] => :@shared + ExceedsMaxError = Class.new(StandardError) + + # initialization and opening of a spreadsheet file + # values for packed: :zip + # optional cell_max (int) parameter for early aborting attempts to parse + # enormous documents. + def initialize(filename_or_stream, options = {}) + packed = options[:packed] + file_warning = options.fetch(:file_warning, :error) + cell_max = options.delete(:cell_max) + sheet_options = {} + sheet_options[:expand_merged_ranges] = (options[:expand_merged_ranges] || false) + sheet_options[:no_hyperlinks] = (options[:no_hyperlinks] || false) + + unless is_stream?(filename_or_stream) + file_type_check(filename_or_stream, %w[.xlsx .xlsm], 'an Excel 2007', file_warning, packed) + basename = find_basename(filename_or_stream) + end + + # NOTE: Create temp directory and allow Ruby to cleanup the temp directory + # when the object is garbage collected. Initially, the finalizer was + # created in the Roo::Tempdir module, but that led to a segfault + # when testing in Ruby 2.4.0. + @tmpdir = self.class.make_tempdir(self, basename, options[:tmpdir_root]) + ObjectSpace.define_finalizer(self, self.class.finalize(object_id)) + + @shared = Shared.new(@tmpdir) + @filename = local_filename(filename_or_stream, @tmpdir, packed) + process_zipfile(@filename || filename_or_stream) + + @sheet_names = workbook.sheets.map do |sheet| + unless options[:only_visible_sheets] && sheet['state'] == 'hidden' + sheet['name'] + end + end.compact + @sheets = [] + @sheets_by_name = Hash[@sheet_names.map.with_index do |sheet_name, n| + @sheets[n] = Sheet.new(sheet_name, @shared, n, sheet_options) + [sheet_name, @sheets[n]] + end] + + if cell_max + cell_count = ::Roo::Utils.num_cells_in_range(sheet_for(options.delete(:sheet)).dimensions) + raise ExceedsMaxError.new("Excel file exceeds cell maximum: #{cell_count} > #{cell_max}") if cell_count > cell_max + end + + super + rescue + self.class.finalize_tempdirs(object_id) + raise + end + + def method_missing(method, *args) + if (label = workbook.defined_names[method.to_s]) + safe_send(sheet_for(label.sheet).cells[label.key], :value) + else + # call super for methods like #a1 + super + end + end + + def sheets + @sheet_names + end + + def sheet_for(sheet) + sheet ||= default_sheet + validate_sheet!(sheet) + @sheets_by_name[sheet] + end + + # Returns the content of a spreadsheet-cell. + # (1,1) is the upper left corner. + # (1,1), (1,'A'), ('A',1), ('a',1) all refers to the + # cell at the first line and first row. + def cell(row, col, sheet = nil) + key = normalize(row, col) + safe_send(sheet_for(sheet).cells[key], :value) + end + + def row(rownumber, sheet = nil) + sheet_for(sheet).row(rownumber) + end + + # returns all values in this column as an array + # column numbers are 1,2,3,... like in the spreadsheet + def column(column_number, sheet = nil) + if column_number.is_a?(::String) + column_number = ::Roo::Utils.letter_to_number(column_number) + end + sheet_for(sheet).column(column_number) + end + + # returns the number of the first non-empty row + def first_row(sheet = nil) + sheet_for(sheet).first_row + end + + # returns the number of the last non-empty row + def last_row(sheet = nil) + sheet_for(sheet).last_row + end + + # returns the number of the first non-empty column + def first_column(sheet = nil) + sheet_for(sheet).first_column + end + + # returns the number of the last non-empty column + def last_column(sheet = nil) + sheet_for(sheet).last_column + end + + # set a cell to a certain value + # (this will not be saved back to the spreadsheet file!) + def set(row, col, value, sheet = nil) #:nodoc: + key = normalize(row, col) + cell_type = cell_type_by_value(value) + sheet_for(sheet).cells[key] = Cell.new(value, cell_type, nil, cell_type, value, nil, nil, nil, Coordinate.new(row, col)) + end + + # Returns the formula at (row,col). + # Returns nil if there is no formula. + # The method #formula? checks if there is a formula. + def formula(row, col, sheet = nil) + key = normalize(row, col) + safe_send(sheet_for(sheet).cells[key], :formula) + end + + # Predicate methods really should return a boolean + # value. Hopefully no one was relying on the fact that this + # previously returned either nil/formula + def formula?(*args) + !!formula(*args) + end + + # returns each formula in the selected sheet as an array of tuples in following format + # [[row, col, formula], [row, col, formula],...] + def formulas(sheet = nil) + sheet_for(sheet).cells.select { |_, cell| cell.formula }.map do |(x, y), cell| + [x, y, cell.formula] + end + end + + # Given a cell, return the cell's style + def font(row, col, sheet = nil) + key = normalize(row, col) + definition_index = safe_send(sheet_for(sheet).cells[key], :style) + styles.definitions[definition_index] if definition_index + end + + # returns the type of a cell: + # * :float + # * :string, + # * :date + # * :percentage + # * :formula + # * :time + # * :datetime + def celltype(row, col, sheet = nil) + key = normalize(row, col) + safe_send(sheet_for(sheet).cells[key], :type) + end + + # returns the internal type of an excel cell + # * :numeric_or_formula + # * :string + # Note: this is only available within the Excelx class + def excelx_type(row, col, sheet = nil) + key = normalize(row, col) + safe_send(sheet_for(sheet).cells[key], :cell_type) + end + + # returns the internal value of an excelx cell + # Note: this is only available within the Excelx class + def excelx_value(row, col, sheet = nil) + key = normalize(row, col) + safe_send(sheet_for(sheet).cells[key], :cell_value) + end + + # returns the internal value of an excelx cell + # Note: this is only available within the Excelx class + def formatted_value(row, col, sheet = nil) + key = normalize(row, col) + safe_send(sheet_for(sheet).cells[key], :formatted_value) + end + + # returns the internal format of an excel cell + def excelx_format(row, col, sheet = nil) + key = normalize(row, col) + sheet_for(sheet).excelx_format(key) + end + + def empty?(row, col, sheet = nil) + sheet = sheet_for(sheet) + key = normalize(row, col) + cell = sheet.cells[key] + !cell || cell.empty? || + (row < sheet.first_row || row > sheet.last_row || col < sheet.first_column || col > sheet.last_column) + end + + # shows the internal representation of all cells + # for debugging purposes + def to_s(sheet = nil) + sheet_for(sheet).cells.inspect + end + + # returns the row,col values of the labelled cell + # (nil,nil) if label is not defined + def label(name) + labels = workbook.defined_names + return [nil, nil, nil] if labels.empty? || !labels.key?(name) + + [labels[name].row, labels[name].col, labels[name].sheet] + end + + # Returns an array which all labels. Each element is an array with + # [labelname, [row,col,sheetname]] + def labels + @labels ||= workbook.defined_names.map do |name, label| + [ + name, + [label.row, label.col, label.sheet] + ] + end + end + + def hyperlink?(row, col, sheet = nil) + !!hyperlink(row, col, sheet) + end + + # returns the hyperlink at (row/col) + # nil if there is no hyperlink + def hyperlink(row, col, sheet = nil) + key = normalize(row, col) + sheet_for(sheet).hyperlinks[key] + end + + # returns the comment at (row/col) + # nil if there is no comment + def comment(row, col, sheet = nil) + key = normalize(row, col) + sheet_for(sheet).comments[key] + end + + # true, if there is a comment + def comment?(row, col, sheet = nil) + !!comment(row, col, sheet) + end + + def comments(sheet = nil) + sheet_for(sheet).comments.map do |(x, y), comment| + [x, y, comment] + end + end + + # Yield an array of Excelx::Cell + # Takes options for sheet, pad_cells, and max_rows + def each_row_streaming(options = {}) + sheet = sheet_for(options.delete(:sheet)) + if block_given? + sheet.each_row(options) { |row| yield row } + else + sheet.to_enum(:each_row, options) + end + end + + private + + def clean_sheet(sheet) + @sheets_by_name[sheet].cells.each_pair do |coord, value| + next unless value.value.is_a?(::String) + + @sheets_by_name[sheet].cells[coord].value = sanitize_value(value.value) + end + + @cleaned[sheet] = true + end + + # Internal: extracts the worksheet_ids from the workbook.xml file. xlsx + # documents require a workbook.xml file, so a if the file is missing + # it is not a valid xlsx file. In these cases, an ArgumentError is + # raised. + # + # wb - a Zip::Entry for the workbook.xml file. + # path - A String for Zip::Entry's destination path. + # + # Examples + # + # extract_worksheet_ids(<Zip::Entry>, 'tmpdir/roo_workbook.xml') + # # => ["rId1", "rId2", "rId3"] + # + # Returns an Array of Strings. + def extract_worksheet_ids(entries, path) + wb = entries.find { |e| e.name[/workbook.xml$/] } + fail ArgumentError 'missing required workbook file' if wb.nil? + + wb.extract(path) + workbook_doc = Roo::Utils.load_xml(path).remove_namespaces! + workbook_doc.xpath('//sheet').map { |s| s.attributes['id'].value } + end + + # Internal + # + # wb_rels - A Zip::Entry for the workbook.xml.rels file. + # path - A String for the Zip::Entry's destination path. + # + # Examples + # + # extract_worksheets(<Zip::Entry>, 'tmpdir/roo_workbook.xml.rels') + # # => { + # "rId1"=>"worksheets/sheet1.xml", + # "rId2"=>"worksheets/sheet2.xml", + # "rId3"=>"worksheets/sheet3.xml" + # } + # + # Returns a Hash. + def extract_worksheet_rels(entries, path) + wb_rels = entries.find { |e| e.name[/workbook.xml.rels$/] } + fail ArgumentError 'missing required workbook file' if wb_rels.nil? + + wb_rels.extract(path) + rels_doc = Roo::Utils.load_xml(path).remove_namespaces! + worksheet_type = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet' + + relationships = rels_doc.xpath('//Relationship').select do |relationship| + relationship.attributes['Type'].value == worksheet_type + end + + relationships.inject({}) do |hash, relationship| + attributes = relationship.attributes + id = attributes['Id'] + hash[id.value] = attributes['Target'].value + hash + end + end + + # Extracts the sheets in order, but it will ignore sheets that are not + # worksheets. + def extract_sheets_in_order(entries, sheet_ids, sheets, tmpdir) + (sheet_ids & sheets.keys).each_with_index do |id, i| + name = sheets[id] + entry = entries.find { |e| "/#{e.name}" =~ /#{name}$/ } + path = "#{tmpdir}/roo_sheet#{i + 1}" + sheet_files << path + @sheet_files << path + entry.extract(path) + end + end + + # Extracts all needed files from the zip file + def process_zipfile(zipfilename_or_stream) + @sheet_files = [] + + unless is_stream?(zipfilename_or_stream) + zip_file = Zip::File.open(zipfilename_or_stream) + else + zip_file = Zip::CentralDirectory.new + zip_file.read_from_stream zipfilename_or_stream + end + + process_zipfile_entries zip_file.to_a.sort_by(&:name) + end + + def process_zipfile_entries(entries) + # NOTE: When Google or Numbers 3.1 exports to xlsx, the worksheet filenames + # are not in order. With Numbers 3.1, the first sheet is always + # sheet.xml, not sheet1.xml. With Google, the order of the worksheets is + # independent of a worksheet's filename (i.e. sheet6.xml can be the + # first worksheet). + # + # workbook.xml lists the correct order of worksheets and + # workbook.xml.rels lists the filenames for those worksheets. + # + # workbook.xml: + # <sheet state="visible" name="IS" sheetId="1" r:id="rId3"/> + # <sheet state="visible" name="BS" sheetId="2" r:id="rId4"/> + # workbook.xml.rel: + # <Relationship Id="rId4" Target="worksheets/sheet5.xml" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet"/> + # <Relationship Id="rId3" Target="worksheets/sheet4.xml" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet"/> + sheet_ids = extract_worksheet_ids(entries, "#{@tmpdir}/roo_workbook.xml") + sheets = extract_worksheet_rels(entries, "#{@tmpdir}/roo_workbook.xml.rels") + extract_sheets_in_order(entries, sheet_ids, sheets, @tmpdir) + + entries.each do |entry| + path = + case entry.name.downcase + when /sharedstrings.xml$/ + "#{@tmpdir}/roo_sharedStrings.xml" + when /styles.xml$/ + "#{@tmpdir}/roo_styles.xml" + when /comments([0-9]+).xml$/ + # FIXME: Most of the time, The order of the comment files are the same + # the sheet order, i.e. sheet1.xml's comments are in comments1.xml. + # In some situations, this isn't true. The true location of a + # sheet's comment file is in the sheet1.xml.rels file. SEE + # ECMA-376 12.3.3 in "Ecma Office Open XML Part 1". + nr = Regexp.last_match[1].to_i + comments_files[nr - 1] = "#{@tmpdir}/roo_comments#{nr}" + when %r{chartsheets/_rels/sheet([0-9]+).xml.rels$} + # NOTE: Chart sheet relationship files were interfering with + # worksheets. + nil + when /sheet([0-9]+).xml.rels$/ + # FIXME: Roo seems to use sheet[\d].xml.rels for hyperlinks only, but + # it also stores the location for sharedStrings, comments, + # drawings, etc. + nr = Regexp.last_match[1].to_i + rels_files[nr - 1] = "#{@tmpdir}/roo_rels#{nr}" + end + + entry.extract(path) if path + end + end + + def safe_send(object, method, *args) + object.send(method, *args) if object && object.respond_to?(method) + end + end +end diff --git a/lib/roo/excelx/cell.rb b/lib/roo/excelx/cell.rb new file mode 100644 index 0000000..2fc78e1 --- /dev/null +++ b/lib/roo/excelx/cell.rb @@ -0,0 +1,106 @@ +require 'date' +require 'roo/excelx/cell/base' +require 'roo/excelx/cell/boolean' +require 'roo/excelx/cell/datetime' +require 'roo/excelx/cell/date' +require 'roo/excelx/cell/empty' +require 'roo/excelx/cell/number' +require 'roo/excelx/cell/string' +require 'roo/excelx/cell/time' + +module Roo + class Excelx + class Cell + attr_reader :formula, :value, :excelx_type, :excelx_value, :style, :hyperlink, :coordinate + attr_writer :value + + # DEPRECATED: Please use Cell.create_cell instead. + def initialize(value, type, formula, excelx_type, excelx_value, style, hyperlink, base_date, coordinate) + warn '[DEPRECATION] `Cell.new` is deprecated. Please use `Cell.create_cell` instead.' + @type = type + @formula = formula + @base_date = base_date if [:date, :datetime].include?(@type) + @excelx_type = excelx_type + @excelx_value = excelx_value + @style = style + @value = type_cast_value(value) + @value = Roo::Link.new(hyperlink, @value.to_s) if hyperlink + @coordinate = coordinate + end + + def type + case + when @formula + :formula + when @value.is_a?(Roo::Link) + :link + else + @type + end + end + + def self.create_cell(type, *values) + case type + when :string + Cell::String.new(*values) + when :boolean + Cell::Boolean.new(*values) + when :number + Cell::Number.new(*values) + when :date + Cell::Date.new(*values) + when :datetime + Cell::DateTime.new(*values) + when :time + Cell::Time.new(*values) + end + end + + # Deprecated: use Roo::Excelx::Coordinate instead. + class Coordinate + attr_accessor :row, :column + + def initialize(row, column) + warn '[DEPRECATION] `Roo::Excel::Cell::Coordinate` is deprecated. Please use `Roo::Excelx::Coordinate` instead.' + @row, @column = row, column + end + end + + private + + def type_cast_value(value) + case @type + when :float, :percentage + value.to_f + when :date + create_date(@base_date + value.to_i) + when :datetime + create_datetime(@base_date + value.to_f.round(6)) + when :time + value.to_f * 86_400 + else + value + end + end + + def create_date(date) + yyyy, mm, dd = date.strftime('%Y-%m-%d').split('-') + + ::Date.new(yyyy.to_i, mm.to_i, dd.to_i) + end + + def create_datetime(date) + datetime_string = date.strftime('%Y-%m-%d %H:%M:%S.%N') + t = round_datetime(datetime_string) + + ::DateTime.civil(t.year, t.month, t.day, t.hour, t.min, t.sec) + end + + def round_datetime(datetime_string) + /(?<yyyy>\d+)-(?<mm>\d+)-(?<dd>\d+) (?<hh>\d+):(?<mi>\d+):(?<ss>\d+.\d+)/ =~ datetime_string + + ::Time.new(yyyy.to_i, mm.to_i, dd.to_i, hh.to_i, mi.to_i, ss.to_r).round(0) + end + end + end +end diff --git a/lib/roo/excelx/cell/base.rb b/lib/roo/excelx/cell/base.rb new file mode 100644 index 0000000..aea8808 --- /dev/null +++ b/lib/roo/excelx/cell/base.rb @@ -0,0 +1,94 @@ +module Roo + class Excelx + class Cell + class Base + attr_reader :cell_type, :cell_value, :value + + # FIXME: I think style should be deprecated. Having a style attribute + # for a cell doesn't really accomplish much. It seems to be used + # when you want to export to excelx. + attr_reader :style + + + # FIXME: Updating a cell's value should be able tochange the cell's type, + # but that isn't currently possible. This will cause weird bugs + # when one changes the value of a Number cell to a String. e.g. + # + # cell = Cell::Number(*args) + # cell.value = 'Hello' + # cell.formatted_value # => Some unexpected value + # + # Here are two possible solutions to such issues: + # 1. Don't allow a cell's value to be updated. Use a method like + # `Sheet.update_cell` instead. The simple solution. + # 2. When `cell.value = ` is called, use injection to try and + # change the type of cell on the fly. But deciding what type + # of value to pass to `cell.value=`. isn't always obvious. e.g. + # `cell.value = Time.now` should convert a cell to a DateTime, + # not a Time cell. Time cells would be hard to recognize because + # they are integers. This approach would require a significant + # change to the code as written. The complex solution. + # + # If the first solution is used, then this method should be + # deprecated. + attr_writer :value + + def initialize(value, formula, excelx_type, style, link, coordinate) + @link = !!link + @cell_value = value + @cell_type = excelx_type + @formula = formula + @style = style + @coordinate = coordinate + @type = :base + @value = link? ? Roo::Link.new(link, value) : value + end + + def type + if formula? + :formula + elsif link? + :link + else + @type + end + end + + def formula? + !!@formula + end + + def link? + !!@link + end + + alias_method :formatted_value, :value + + def to_s + formatted_value + end + + # DEPRECATED: Please use link instead. + def hyperlink + warn '[DEPRECATION] `hyperlink` is deprecated. Please use `link` instead.' + end + + # DEPRECATED: Please use cell_value instead. + def excelx_value + warn '[DEPRECATION] `excelx_value` is deprecated. Please use `cell_value` instead.' + cell_value + end + + # DEPRECATED: Please use cell_type instead. + def excelx_type + warn '[DEPRECATION] `excelx_type` is deprecated. Please use `cell_type` instead.' + cell_type + end + + def empty? + false + end + end + end + end +end diff --git a/lib/roo/excelx/cell/boolean.rb b/lib/roo/excelx/cell/boolean.rb new file mode 100644 index 0000000..fe1f691 --- /dev/null +++ b/lib/roo/excelx/cell/boolean.rb @@ -0,0 +1,27 @@ +module Roo + class Excelx + class Cell + class Boolean < Cell::Base + attr_reader :value, :formula, :format, :cell_type, :cell_value, :link, :coordinate + + def initialize(value, formula, style, link, coordinate) + super(value, formula, nil, style, link, coordinate) + @type = @cell_type = :boolean + @value = link? ? Roo::Link.new(link, value) : create_boolean(value) + end + + def formatted_value + value ? 'TRUE'.freeze : 'FALSE'.freeze + end + + private + + def create_boolean(value) + # FIXME: Using a boolean will cause methods like Base#to_csv to fail. + # Roo is using some method to ignore false/nil values. + value.to_i == 1 ? true : false + end + end + end + end +end diff --git a/lib/roo/excelx/cell/date.rb b/lib/roo/excelx/cell/date.rb new file mode 100644 index 0000000..8e2c6cb --- /dev/null +++ b/lib/roo/excelx/cell/date.rb @@ -0,0 +1,28 @@ +require 'date' + +module Roo + class Excelx + class Cell + class Date < Roo::Excelx::Cell::DateTime + attr_reader :value, :formula, :format, :cell_type, :cell_value, :link, :coordinate + + def initialize(value, formula, excelx_type, style, link, base_date, coordinate) + # NOTE: Pass all arguments to the parent class, DateTime. + super + @type = :date + @format = excelx_type.last + @value = link? ? Roo::Link.new(link, value) : create_date(base_date, value) + end + + private + + def create_date(base_date, value) + date = base_date + value.to_i + yyyy, mm, dd = date.strftime('%Y-%m-%d').split('-') + + ::Date.new(yyyy.to_i, mm.to_i, dd.to_i) + end + end + end + end +end diff --git a/lib/roo/excelx/cell/datetime.rb b/lib/roo/excelx/cell/datetime.rb new file mode 100644 index 0000000..35d93ac --- /dev/null +++ b/lib/roo/excelx/cell/datetime.rb @@ -0,0 +1,111 @@ +require 'date' + +module Roo + class Excelx + class Cell + class DateTime < Cell::Base + attr_reader :value, :formula, :format, :cell_value, :link, :coordinate + + def initialize(value, formula, excelx_type, style, link, base_date, coordinate) + super(value, formula, excelx_type, style, link, coordinate) + @type = :datetime + @format = excelx_type.last + @value = link? ? Roo::Link.new(link, value) : create_datetime(base_date, value) + end + + # Public: Returns formatted value for a datetime. Format's can be an + # standard excel format, or a custom format. + # + # Standard formats follow certain conventions. Date fields for + # days, months, and years are separated with hyhens or + # slashes ("-", /") (e.g. 01-JAN, 1/13/15). Time fields for + # hours, minutes, and seconds are separated with a colon (e.g. + # 12:45:01). + # + # If a custom format follows those conventions, then the custom + # format will be used for the a cell's formatted value. + # Otherwise, the formatted value will be in the following + # format: 'YYYY-mm-dd HH:MM:SS' (e.g. "2015-07-10 20:33:15"). + # + # Examples + # formatted_value #=> '01-JAN' + # + # Returns a String representation of a cell's value. + def formatted_value + formatter = @format.downcase.split(' ').map do |part| + if (parsed_format = parse_date_or_time_format(part)) + parsed_format + else + warn 'Unable to parse custom format. Using "YYYY-mm-dd HH:MM:SS" format.' + return @value.strftime('%F %T') + end + end.join(' ') + + @value.strftime(formatter) + end + + private + + def parse_date_or_time_format(part) + date_regex = /(?<date>[dmy]+[\-\/][dmy]+([\-\/][dmy]+)?)/ + time_regex = /(?<time>(\[?[h]\]?+:)?[m]+(:?ss|:?s)?)/ + + if part[date_regex] == part + formats = DATE_FORMATS + elsif part[time_regex] + formats = TIME_FORMATS + else + return false + end + + part.gsub(/#{formats.keys.join('|')}/, formats) + end + + DATE_FORMATS = { + 'yyyy' => '%Y', # Year: 2000 + 'yy' => '%y', # Year: 00 + # mmmmm => J-D + 'mmmm' => '%B', # Month: January + 'mmm' => '%^b', # Month: JAN + 'mm' => '%m', # Month: 01 + 'm' => '%-m', # Month: 1 + 'dddd' => '%A', # Day of the Week: Sunday + 'ddd' => '%^a', # Day of the Week: SUN + 'dd' => '%d', # Day of the Month: 01 + 'd' => '%-d' # Day of the Month: 1 + # '\\\\'.freeze => ''.freeze, # NOTE: Fixes a custom format's output. + } + + TIME_FORMATS = { + 'hh' => '%H', # Hour (24): 01 + 'h' => '%-k'.freeze, # Hour (24): 1 + # 'hh'.freeze => '%I'.freeze, # Hour (12): 08 + # 'h'.freeze => '%-l'.freeze, # Hour (12): 8 + 'mm' => '%M', # Minute: 01 + # FIXME: is this used? Seems like 'm' is used for month, not minute. + 'm' => '%-M', # Minute: 1 + 'ss' => '%S', # Seconds: 01 + 's' => '%-S', # Seconds: 1 + 'am/pm' => '%p', # Meridian: AM + '000' => '%3N', # Fractional Seconds: thousandth. + '00' => '%2N', # Fractional Seconds: hundredth. + '0' => '%1N' # Fractional Seconds: tenths. + } + + def create_datetime(base_date, value) + date = base_date + value.to_f.round(6) + datetime_string = date.strftime('%Y-%m-%d %H:%M:%S.%N') + t = round_datetime(datetime_string) + + ::DateTime.civil(t.year, t.month, t.day, t.hour, t.min, t.sec) + end + + def round_datetime(datetime_string) + /(?<yyyy>\d+)-(?<mm>\d+)-(?<dd>\d+) (?<hh>\d+):(?<mi>\d+):(?<ss>\d+.\d+)/ =~ datetime_string + + ::Time.new(yyyy, mm, dd, hh, mi, ss.to_r).round(0) + end + end + end + end +end diff --git a/lib/roo/excelx/cell/empty.rb b/lib/roo/excelx/cell/empty.rb new file mode 100644 index 0000000..49a20e7 --- /dev/null +++ b/lib/roo/excelx/cell/empty.rb @@ -0,0 +1,19 @@ + +module Roo + class Excelx + class Cell + class Empty < Cell::Base + attr_reader :value, :formula, :format, :cell_type, :cell_value, :hyperlink, :coordinate + + def initialize(coordinate) + @value = @formula = @format = @cell_type = @cell_value = @hyperlink = nil + @coordinate = coordinate + end + + def empty? + true + end + end + end + end +end diff --git a/lib/roo/excelx/cell/number.rb b/lib/roo/excelx/cell/number.rb new file mode 100644 index 0000000..2015562 --- /dev/null +++ b/lib/roo/excelx/cell/number.rb @@ -0,0 +1,87 @@ +module Roo + class Excelx + class Cell + class Number < Cell::Base + attr_reader :value, :formula, :format, :cell_value, :link, :coordinate + + def initialize(value, formula, excelx_type, style, link, coordinate) + super + # FIXME: change @type to number. This will break brittle tests. + # FIXME: Excelx_type is an array, but the first value isn't used. + @type = :float + @format = excelx_type.last + @value = link? ? Roo::Link.new(link, value) : create_numeric(value) + end + + def create_numeric(number) + return number if Excelx::ERROR_VALUES.include?(number) + case @format + when /%/ + Float(number) + when /\.0/ + Float(number) + else + (number.include?('.') || (/\A[-+]?\d+E[-+]\d+\z/i =~ number)) ? Float(number) : Integer(number) + end + end + + def formatted_value + return @cell_value if Excelx::ERROR_VALUES.include?(@cell_value) + + formatter = formats[@format] + if formatter.is_a? Proc + formatter.call(@cell_value) + elsif zero_padded_number? + "%0#{@format.size}d" % @cell_value + else + Kernel.format(formatter, @cell_value) + end + end + + def formats + # FIXME: numbers can be other colors besides red: + # [BLACK], [BLUE], [CYAN], [GREEN], [MAGENTA], [RED], [WHITE], [YELLOW], [COLOR n] + { + 'General' => '%.0f', + '0' => '%.0f', + '0.00' => '%.2f', + '0.000000' => '%.6f', + '#,##0' => number_format('%.0f'), + '#,##0.00' => number_format('%.2f'), + '0%' => proc do |number| + Kernel.format('%d%', number.to_f * 100) + end, + '0.00%' => proc do |number| + Kernel.format('%.2f%', number.to_f * 100) + end, + '0.00E+00' => '%.2E', + '#,##0 ;(#,##0)' => number_format('%.0f', '(%.0f)'), + '#,##0 ;[Red](#,##0)' => number_format('%.0f', '[Red](%.0f)'), + '#,##0.00;(#,##0.00)' => number_format('%.2f', '(%.2f)'), + '#,##0.00;[Red](#,##0.00)' => number_format('%.2f', '[Red](%.2f)'), + # FIXME: not quite sure what the format should look like in this case. + '##0.0E+0' => '%.1E', + '@' => proc { |number| number } + } + end + + private + + def number_format(formatter, negative_formatter = nil) + proc do |number| + if negative_formatter + formatter = number.to_i > 0 ? formatter : negative_formatter + number = number.to_f.abs + end + + Kernel.format(formatter, number).reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse + end + end + + def zero_padded_number? + @format[/0+/] == @format + end + end + end + end +end diff --git a/lib/roo/excelx/cell/string.rb b/lib/roo/excelx/cell/string.rb new file mode 100644 index 0000000..7967806 --- /dev/null +++ b/lib/roo/excelx/cell/string.rb @@ -0,0 +1,19 @@ +module Roo + class Excelx + class Cell + class String < Cell::Base + attr_reader :value, :formula, :format, :cell_type, :cell_value, :link, :coordinate + + def initialize(value, formula, style, link, coordinate) + super(value, formula, nil, style, link, coordinate) + @type = @cell_type = :string + @value = link? ? Roo::Link.new(link, value) : value + end + + def empty? + value.empty? + end + end + end + end +end diff --git a/lib/roo/excelx/cell/time.rb b/lib/roo/excelx/cell/time.rb new file mode 100644 index 0000000..d661ab8 --- /dev/null +++ b/lib/roo/excelx/cell/time.rb @@ -0,0 +1,43 @@ +require 'date' + +module Roo + class Excelx + class Cell + class Time < Roo::Excelx::Cell::DateTime + attr_reader :value, :formula, :format, :cell_value, :link, :coordinate + + def initialize(value, formula, excelx_type, style, link, base_date, coordinate) + # NOTE: Pass all arguments to DateTime super class. + super + @type = :time + @format = excelx_type.last + @datetime = create_datetime(base_date, value) + @value = link? ? Roo::Link.new(link, value) : (value.to_f * 86_400).to_i + end + + def formatted_value + formatter = @format.gsub(/#{TIME_FORMATS.keys.join('|')}/, TIME_FORMATS) + @datetime.strftime(formatter) + end + + alias_method :to_s, :formatted_value + + private + + # def create_datetime(base_date, value) + # date = base_date + value.to_f.round(6) + # datetime_string = date.strftime('%Y-%m-%d %H:%M:%S.%N') + # t = round_datetime(datetime_string) + # + # ::DateTime.civil(t.year, t.month, t.day, t.hour, t.min, t.sec) + # end + + # def round_datetime(datetime_string) + # /(?<yyyy>\d+)-(?<mm>\d+)-(?<dd>\d+) (?<hh>\d+):(?<mi>\d+):(?<ss>\d+.\d+)/ =~ datetime_string + # + # ::Time.new(yyyy.to_i, mm.to_i, dd.to_i, hh.to_i, mi.to_i, ss.to_r).round(0) + # end + end + end + end +end diff --git a/lib/roo/excelx/comments.rb b/lib/roo/excelx/comments.rb new file mode 100644 index 0000000..1a89908 --- /dev/null +++ b/lib/roo/excelx/comments.rb @@ -0,0 +1,55 @@ +require 'roo/excelx/extractor' + +module Roo + class Excelx + class Comments < Excelx::Extractor + def comments + @comments ||= extract_comments + end + + private + + def extract_comments + return {} unless doc_exists? + + Hash[doc.xpath('//comments/commentList/comment').map do |comment| + value = (comment.at_xpath('./text/r/t') || comment.at_xpath('./text/t')).text + [::Roo::Utils.ref_to_key(comment.attributes['ref'].to_s), value] + end] + end + end + end +end +# xl/comments1.xml +# <?xml version="1.0" encoding="UTF-8" standalone="yes" ?> +# <comments xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main"> +# <authors> +# <author /> +# </authors> +# <commentList> +# <comment ref="B4" authorId="0"> +# <text> +# <r> +# <rPr> +# <sz val="10" /> +# <rFont val="Arial" /> +# <family val="2" /> +# </rPr> +# <t>Comment for B4</t> +# </r> +# </text> +# </comment> +# <comment ref="B5" authorId="0"> +# <text> +# <r> +# <rPr> +# <sz val="10" /> +# <rFont val="Arial" /> +# <family val="2" /> +# </rPr> +# <t>Comment for B5</t> +# </r> +# </text> +# </comment> +# </commentList> +# </comments> diff --git a/lib/roo/excelx/coordinate.rb b/lib/roo/excelx/coordinate.rb new file mode 100644 index 0000000..53b24ba --- /dev/null +++ b/lib/roo/excelx/coordinate.rb @@ -0,0 +1,12 @@ +module Roo + class Excelx + class Coordinate + attr_accessor :row, :column + + def initialize(row, column) + @row = row + @column = column + end + end + end +end diff --git a/lib/roo/excelx/extractor.rb b/lib/roo/excelx/extractor.rb new file mode 100644 index 0000000..1cdd13b --- /dev/null +++ b/lib/roo/excelx/extractor.rb @@ -0,0 +1,21 @@ +module Roo + class Excelx + class Extractor + def initialize(path) + @path = path + end + + private + + def doc + raise FileNotFound, "#{@path} file not found" unless doc_exists? + + ::Roo::Utils.load_xml(@path).remove_namespaces! + end + + def doc_exists? + @path && File.exist?(@path) + end + end + end +end diff --git a/lib/roo/excelx/format.rb b/lib/roo/excelx/format.rb new file mode 100644 index 0000000..72b36d9 --- /dev/null +++ b/lib/roo/excelx/format.rb @@ -0,0 +1,64 @@ +module Roo + class Excelx + module Format + EXCEPTIONAL_FORMATS = { + 'h:mm am/pm' => :date, + 'h:mm:ss am/pm' => :date + } + + STANDARD_FORMATS = { + 0 => 'General'.freeze, + 1 => '0'.freeze, + 2 => '0.00'.freeze, + 3 => '#,##0'.freeze, + 4 => '#,##0.00'.freeze, + 9 => '0%'.freeze, + 10 => '0.00%'.freeze, + 11 => '0.00E+00'.freeze, + 12 => '# ?/?'.freeze, + 13 => '# ??/??'.freeze, + 14 => 'mm-dd-yy'.freeze, + 15 => 'd-mmm-yy'.freeze, + 16 => 'd-mmm'.freeze, + 17 => 'mmm-yy'.freeze, + 18 => 'h:mm AM/PM'.freeze, + 19 => 'h:mm:ss AM/PM'.freeze, + 20 => 'h:mm'.freeze, + 21 => 'h:mm:ss'.freeze, + 22 => 'm/d/yy h:mm'.freeze, + 37 => '#,##0 ;(#,##0)'.freeze, + 38 => '#,##0 ;[Red](#,##0)'.freeze, + 39 => '#,##0.00;(#,##0.00)'.freeze, + 40 => '#,##0.00;[Red](#,##0.00)'.freeze, + 45 => 'mm:ss'.freeze, + 46 => '[h]:mm:ss'.freeze, + 47 => 'mmss.0'.freeze, + 48 => '##0.0E+0'.freeze, + 49 => '@'.freeze + } + + def to_type(format) + format = format.to_s.downcase + if (type = EXCEPTIONAL_FORMATS[format]) + type + elsif format.include?('#') + :float + elsif !format.match(/d+(?![\]])/).nil? || format.include?('y') + if format.include?('h') || format.include?('s') + :datetime + else + :date + end + elsif format.include?('h') || format.include?('s') + :time + elsif format.include?('%') + :percentage + else + :float + end + end + + module_function :to_type + end + end +end diff --git a/lib/roo/excelx/relationships.rb b/lib/roo/excelx/relationships.rb new file mode 100644 index 0000000..8a0ed97 --- /dev/null +++ b/lib/roo/excelx/relationships.rb @@ -0,0 +1,25 @@ +require 'roo/excelx/extractor' + +module Roo + class Excelx + class Relationships < Excelx::Extractor + def [](index) + to_a[index] + end + + def to_a + @relationships ||= extract_relationships + end + + private + + def extract_relationships + return [] unless doc_exists? + + Hash[doc.xpath('/Relationships/Relationship').map do |rel| + [rel.attribute('Id').text, rel] + end] + end + end + end +end diff --git a/lib/roo/excelx/shared.rb b/lib/roo/excelx/shared.rb new file mode 100644 index 0000000..3677fa2 --- /dev/null +++ b/lib/roo/excelx/shared.rb @@ -0,0 +1,32 @@ +module Roo + class Excelx + # Public: Shared class for allowing sheets to share data. This should + # reduce memory usage and reduce the number of objects being passed + # to various inititializers. + class Shared + attr_accessor :comments_files, :sheet_files, :rels_files + def initialize(dir) + @dir = dir + @comments_files = [] + @sheet_files = [] + @rels_files = [] + end + + def styles + @styles ||= Styles.new(File.join(@dir, 'roo_styles.xml')) + end + + def shared_strings + @shared_strings ||= SharedStrings.new(File.join(@dir, 'roo_sharedStrings.xml')) + end + + def workbook + @workbook ||= Workbook.new(File.join(@dir, 'roo_workbook.xml')) + end + + def base_date + workbook.base_date + end + end + end +end diff --git a/lib/roo/excelx/shared_strings.rb b/lib/roo/excelx/shared_strings.rb new file mode 100755 index 0000000..f7caf7c --- /dev/null +++ b/lib/roo/excelx/shared_strings.rb @@ -0,0 +1,157 @@ +require 'roo/excelx/extractor' + +module Roo + class Excelx + class SharedStrings < Excelx::Extractor + + COMMON_STRINGS = { + t: "t", + r: "r", + html_tag_open: "<html>", + html_tag_closed: "</html>" + } + + def [](index) + to_a[index] + end + + def to_a + @array ||= extract_shared_strings + end + + def to_html + @html ||= extract_html + end + + # Use to_html or to_a for html returns + # See what is happening with commit??? + def use_html?(index) + to_html[index][/<([biu]|sup|sub)>/] + end + + private + + def fix_invalid_shared_strings(doc) + invalid = { '_x000D_' => "\n" } + xml = doc.to_s + return doc unless xml[/#{invalid.keys.join('|')}/] + + ::Nokogiri::XML(xml.gsub(/#{invalid.keys.join('|')}/, invalid)) + end + + def extract_shared_strings + return [] unless doc_exists? + + document = fix_invalid_shared_strings(doc) + # read the shared strings xml document + document.xpath('/sst/si').map do |si| + shared_string = '' + si.children.each do |elem| + case elem.name + when 'r' + elem.children.each do |r_elem| + shared_string << r_elem.content if r_elem.name == 't' + end + when 't' + shared_string = elem.content + end + end + shared_string + end + end + + def extract_html + return [] unless doc_exists? + fix_invalid_shared_strings(doc) + # read the shared strings xml document + doc.xpath('/sst/si').map do |si| + html_string = '<html>' + si.children.each do |elem| + case elem.name + when 'r' + html_string << extract_html_r(elem) + when 't' + html_string << elem.content + end # case elem.name + end # si.children.each do |elem| + html_string << '</html>' + end # doc.xpath('/sst/si').map do |si| + end # def extract_html + + # The goal of this function is to take the following XML code snippet and create a html tag + # r_elem ::: XML Element that is in sharedStrings.xml of excel_book.xlsx + # {code:xml} + # <r> + # <rPr> + # <i/> + # <b/> + # <u/> + # <vertAlign val="subscript"/> + # <vertAlign val="superscript"/> + # </rPr> + # <t>TEXT</t> + # </r> + # {code} + # + # Expected Output ::: "<html><sub|sup><b><i><u>TEXT</u></i></b></sub|/sup></html>" + def extract_html_r(r_elem) + str = '' + xml_elems = { + sub: false, + sup: false, + b: false, + i: false, + u: false + } + b, i, u, sub, sup = false, false, false, false, false + r_elem.children.each do |elem| + case elem.name + when 'rPr' + elem.children.each do |rPr_elem| + case rPr_elem.name + when 'b' + # set formatting for Bold to true + xml_elems[:b] = true + when 'i' + # set formatting for Italics to true + xml_elems[:i] = true + when 'u' + # set formatting for Underline to true + xml_elems[:u] = true + when 'vertAlign' + # See if the Vertical Alignment is subscript or superscript + case rPr_elem.xpath('@val').first.value + when 'subscript' + # set formatting for Subscript to true and Superscript to false ... Can't have both + xml_elems[:sub] = true + xml_elems[:sup] = false + when 'superscript' + # set formatting for Superscript to true and Subscript to false ... Can't have both + xml_elems[:sup] = true + xml_elems[:sub] = false + end + end + end + when 't' + str << create_html(elem.content, xml_elems) + end + end + str + end # extract_html_r + + # This will return an html string + def create_html(text, formatting) + tmp_str = '' + formatting.each do |elem, val| + tmp_str << "<#{elem}>" if val + end + tmp_str << text + reverse_format = Hash[formatting.to_a.reverse] + reverse_format.each do |elem, val| + tmp_str << "</#{elem}>" if val + end + tmp_str + end + end # class SharedStrings < Excelx::Extractor + end # class Excelx +end # module Roo diff --git a/lib/roo/excelx/sheet.rb b/lib/roo/excelx/sheet.rb new file mode 100644 index 0000000..add92f0 --- /dev/null +++ b/lib/roo/excelx/sheet.rb @@ -0,0 +1,112 @@ +require 'forwardable' +module Roo + class Excelx + class Sheet + extend Forwardable + + delegate [:styles, :workbook, :shared_strings, :rels_files, :sheet_files, :comments_files] => :@shared + + def initialize(name, shared, sheet_index, options = {}) + @name = name + @shared = shared + @rels = Relationships.new(rels_files[sheet_index]) + @comments = Comments.new(comments_files[sheet_index]) + @sheet = SheetDoc.new(sheet_files[sheet_index], @rels, shared, options) + end + + def cells + @cells ||= @sheet.cells(@rels) + end + + def present_cells + @present_cells ||= cells.select { |_, cell| cell && !cell.empty? } + end + + # Yield each row as array of Excelx::Cell objects + # accepts options max_rows (int) (offset by 1 for header), + # pad_cells (boolean) and offset (int) + def each_row(options = {}, &block) + row_count = 0 + options[:offset] ||= 0 + @sheet.each_row_streaming do |row| + break if options[:max_rows] && row_count == options[:max_rows] + options[:offset] + 1 + if block_given? && !(options[:offset] && row_count < options[:offset]) + block.call(cells_for_row_element(row, options)) + end + row_count += 1 + end + end + + def row(row_number) + first_column.upto(last_column).map do |col| + cells[[row_number, col]] + end.map { |cell| cell && cell.value } + end + + def column(col_number) + first_row.upto(last_row).map do |row| + cells[[row, col_number]] + end.map { |cell| cell && cell.value } + end + + # returns the number of the first non-empty row + def first_row + @first_row ||= present_cells.keys.map { |row, _| row }.min + end + + def last_row + @last_row ||= present_cells.keys.map { |row, _| row }.max + end + + # returns the number of the first non-empty column + def first_column + @first_column ||= present_cells.keys.map { |_, col| col }.min + end + + # returns the number of the last non-empty column + def last_column + @last_column ||= present_cells.keys.map { |_, col| col }.max + end + + def excelx_format(key) + cell = cells[key] + styles.style_format(cell.style).to_s if cell + end + + def hyperlinks + @hyperlinks ||= @sheet.hyperlinks(@rels) + end + + def comments + @comments.comments + end + + def dimensions + @sheet.dimensions + end + + private + + # Take an xml row and return an array of Excelx::Cell objects + # optionally pad array to header width(assumed 1st row). + # takes option pad_cells (boolean) defaults false + def cells_for_row_element(row_element, options = {}) + return [] unless row_element + cell_col = 0 + cells = [] + @sheet.each_cell(row_element) do |cell| + cells.concat(pad_cells(cell, cell_col)) if options[:pad_cells] + cells << cell + cell_col = cell.coordinate.column + end + cells + end + + def pad_cells(cell, last_column) + pad = [] + (cell.coordinate.column - 1 - last_column).times { pad << nil } + pad + end + end + end +end diff --git a/lib/roo/excelx/sheet_doc.rb b/lib/roo/excelx/sheet_doc.rb new file mode 100755 index 0000000..a705958 --- /dev/null +++ b/lib/roo/excelx/sheet_doc.rb @@ -0,0 +1,211 @@ +require 'forwardable' +require 'roo/excelx/extractor' + +module Roo + class Excelx + class SheetDoc < Excelx::Extractor + extend Forwardable + delegate [:styles, :workbook, :shared_strings, :base_date] => :@shared + + def initialize(path, relationships, shared, options = {}) + super(path) + @shared = shared + @options = options + @relationships = relationships + end + + def cells(relationships) + @cells ||= extract_cells(relationships) + end + + def hyperlinks(relationships) + @hyperlinks ||= extract_hyperlinks(relationships) + end + + # Get the dimensions for the sheet. + # This is the upper bound of cells that might + # be parsed. (the document may be sparse so cell count is only upper bound) + def dimensions + @dimensions ||= extract_dimensions + end + + # Yield each row xml element to caller + def each_row_streaming(&block) + Roo::Utils.each_element(@path, 'row', &block) + end + + # Yield each cell as Excelx::Cell to caller for given + # row xml + def each_cell(row_xml) + return [] unless row_xml + row_xml.children.each do |cell_element| + # If you're sure you're not going to need this hyperlinks you can discard it + hyperlinks = unless @options[:no_hyperlinks] + key = ::Roo::Utils.ref_to_key(cell_element['r']) + hyperlinks(@relationships)[key] + end + + yield cell_from_xml(cell_element, hyperlinks) + end + end + + private + + def cell_value_type(type, format) + case type + when 's'.freeze + :shared + when 'b'.freeze + :boolean + when 'str'.freeze + :string + when 'inlineStr'.freeze + :inlinestr + else + Excelx::Format.to_type(format) + end + end + + # Internal: Creates a cell based on an XML clell.. + # + # cell_xml - a Nokogiri::XML::Element. e.g. + # <c r="A5" s="2"> + # <v>22606</v> + # </c> + # hyperlink - a String for the hyperlink for the cell or nil when no + # hyperlink is present. + # + # Examples + # + # cells_from_xml(<Nokogiri::XML::Element>, nil) + # # => <Excelx::Cell::String> + # + # Returns a type of <Excelx::Cell>. + def cell_from_xml(cell_xml, hyperlink) + coordinate = extract_coordinate(cell_xml['r']) + return Excelx::Cell::Empty.new(coordinate) if cell_xml.children.empty? + + # NOTE: This is error prone, to_i will silently turn a nil into a 0. + # This works by coincidence because Format[0] is General. + style = cell_xml['s'].to_i + format = styles.style_format(style) + value_type = cell_value_type(cell_xml['t'], format) + formula = nil + + cell_xml.children.each do |cell| + case cell.name + when 'is' + content_arr = cell.search('t').map(&:content) + unless content_arr.empty? + return Excelx::Cell.create_cell(:string, content_arr.join(''), formula, style, hyperlink, coordinate) + end + when 'f' + formula = cell.content + when 'v' + return create_cell_from_value(value_type, cell, formula, format, style, hyperlink, base_date, coordinate) + end + end + + Excelx::Cell::Empty.new(coordinate) + end + + def create_cell_from_value(value_type, cell, formula, format, style, hyperlink, base_date, coordinate) + # NOTE: format.to_s can replace excelx_type as an argument for + # Cell::Time, Cell::DateTime, Cell::Date or Cell::Number, but + # it will break some brittle tests. + excelx_type = [:numeric_or_formula, format.to_s] + + # NOTE: There are only a few situations where value != cell.content + # 1. when a sharedString is used. value = sharedString; + # cell.content = id of sharedString + # 2. boolean cells: value = 'TRUE' | 'FALSE'; cell.content = '0' | '1'; + # But a boolean cell should use TRUE|FALSE as the formatted value + # and use a Boolean for it's value. Using a Boolean value breaks + # Roo::Base#to_csv. + # 3. formula + case value_type + when :shared + value = shared_strings.use_html?(cell.content.to_i) ? shared_strings.to_html[cell.content.to_i] : shared_strings[cell.content.to_i] + Excelx::Cell.create_cell(:string, value, formula, style, hyperlink, coordinate) + when :boolean, :string + value = cell.content + Excelx::Cell.create_cell(value_type, value, formula, style, hyperlink, coordinate) + when :time, :datetime + cell_content = cell.content.to_f + # NOTE: A date will be a whole number. A time will have be > 1. And + # in general, a datetime will have decimals. But if the cell is + # using a custom format, it's possible to be interpreted incorrectly. + # cell_content.to_i == cell_content && standard_style?=> :date + # + # Should check to see if the format is standard or not. If it's a + # standard format, than it's a date, otherwise, it is a datetime. + # @styles.standard_style?(style_id) + # STANDARD_STYLES.keys.include?(style_id.to_i) + cell_type = if cell_content < 1.0 + :time + elsif (cell_content - cell_content.floor).abs > 0.000001 + :datetime + else + :date + end + Excelx::Cell.create_cell(cell_type, cell.content, formula, excelx_type, style, hyperlink, base_date, coordinate) + when :date + Excelx::Cell.create_cell(value_type, cell.content, formula, excelx_type, style, hyperlink, base_date, coordinate) + else + Excelx::Cell.create_cell(:number, cell.content, formula, excelx_type, style, hyperlink, coordinate) + end + end + + def extract_coordinate(coordinate) + row, column = ::Roo::Utils.split_coordinate(coordinate) + + Excelx::Coordinate.new(row, column) + end + + def extract_hyperlinks(relationships) + return {} unless (hyperlinks = doc.xpath('/worksheet/hyperlinks/hyperlink')) + + Hash[hyperlinks.map do |hyperlink| + if hyperlink.attribute('id') && (relationship = relationships[hyperlink.attribute('id').text]) + [::Roo::Utils.ref_to_key(hyperlink.attributes['ref'].to_s), relationship.attribute('Target').text] + end + end.compact] + end + + def expand_merged_ranges(cells) + # Extract merged ranges from xml + merges = {} + doc.xpath('/worksheet/mergeCells/mergeCell').each do |mergecell_xml| + tl, br = mergecell_xml['ref'].split(/:/).map { |ref| ::Roo::Utils.ref_to_key(ref) } + for row in tl[0]..br[0] do + for col in tl[1]..br[1] do + next if row == tl[0] && col == tl[1] + merges[[row, col]] = tl + end + end + end + # Duplicate value into all cells in merged range + merges.each do |dst, src| + cells[dst] = cells[src] + end + end + + def extract_cells(relationships) + extracted_cells = Hash[doc.xpath('/worksheet/sheetData/row/c').map do |cell_xml| + key = ::Roo::Utils.ref_to_key(cell_xml['r']) + [key, cell_from_xml(cell_xml, hyperlinks(relationships)[key])] + end] + + expand_merged_ranges(extracted_cells) if @options[:expand_merged_ranges] + + extracted_cells + end + + def extract_dimensions + Roo::Utils.each_element(@path, 'dimension') do |dimension| + return dimension.attributes['ref'].value + end + end + end + end +end diff --git a/lib/roo/excelx/styles.rb b/lib/roo/excelx/styles.rb new file mode 100644 index 0000000..87f1713 --- /dev/null +++ b/lib/roo/excelx/styles.rb @@ -0,0 +1,64 @@ +require 'roo/font' +require 'roo/excelx/extractor' + +module Roo + class Excelx + class Styles < Excelx::Extractor + # convert internal excelx attribute to a format + def style_format(style) + id = num_fmt_ids[style.to_i] + num_fmts[id] || Excelx::Format::STANDARD_FORMATS[id.to_i] + end + + def definitions + @definitions ||= extract_definitions + end + + private + + def num_fmt_ids + @num_fmt_ids ||= extract_num_fmt_ids + end + + def num_fmts + @num_fmts ||= extract_num_fmts + end + + def fonts + @fonts ||= extract_fonts + end + + def extract_definitions + doc.xpath('//cellXfs').flat_map do |xfs| + xfs.children.map do |xf| + fonts[xf['fontId'].to_i] + end + end + end + + def extract_fonts + doc.xpath('//fonts/font').map do |font_el| + Font.new.tap do |font| + font.bold = !font_el.xpath('./b').empty? + font.italic = !font_el.xpath('./i').empty? + font.underline = !font_el.xpath('./u').empty? + end + end + end + + def extract_num_fmt_ids + doc.xpath('//cellXfs').flat_map do |xfs| + xfs.children.map do |xf| + xf['numFmtId'] + end + end.compact + end + + def extract_num_fmts + Hash[doc.xpath('//numFmt').map do |num_fmt| + [num_fmt['numFmtId'], num_fmt['formatCode']] + end] + end + end + end +end diff --git a/lib/roo/excelx/workbook.rb b/lib/roo/excelx/workbook.rb new file mode 100644 index 0000000..7ef841f --- /dev/null +++ b/lib/roo/excelx/workbook.rb @@ -0,0 +1,59 @@ +require 'roo/excelx/extractor' + +module Roo + class Excelx + class Workbook < Excelx::Extractor + class Label + attr_reader :sheet, :row, :col, :name + + def initialize(name, sheet, row, col) + @name = name + @sheet = sheet + @row = row.to_i + @col = ::Roo::Utils.letter_to_number(col) + end + + def key + [@row, @col] + end + end + + def initialize(path) + super + fail ArgumentError, 'missing required workbook file' unless doc_exists? + end + + def sheets + doc.xpath('//sheet') + end + + # aka labels + def defined_names + Hash[doc.xpath('//definedName').map do |defined_name| + # "Sheet1!$C$5" + sheet, coordinates = defined_name.text.split('!$', 2) + col, row = coordinates.split('$') + name = defined_name['name'] + [name, Label.new(name, sheet, row, col)] + end] + end + + def base_date + @base_date ||= + begin + # Default to 1900 (minus one day due to excel quirk) but use 1904 if + # it's set in the Workbook's workbookPr + # http://msdn.microsoft.com/en-us/library/ff530155(v=office.12).aspx + result = Date.new(1899, 12, 30) # default + doc.css('workbookPr[date1904]').each do |workbookPr| + if workbookPr['date1904'] =~ /true|1/i + result = Date.new(1904, 01, 01) + break + end + end + result + end + end + end + end +end diff --git a/lib/roo/font.rb b/lib/roo/font.rb new file mode 100644 index 0000000..40e9c55 --- /dev/null +++ b/lib/roo/font.rb @@ -0,0 +1,17 @@ +module Roo + class Font + attr_accessor :bold, :italic, :underline + + def bold? + @bold + end + + def italic? + @italic + end + + def underline? + @underline + end + end +end diff --git a/lib/roo/formatters/base.rb b/lib/roo/formatters/base.rb new file mode 100644 index 0000000..2d32d6d --- /dev/null +++ b/lib/roo/formatters/base.rb @@ -0,0 +1,15 @@ +module Roo + module Formatters + module Base + # converts an integer value to a time string like '02:05:06' + def integer_to_timestring(content) + h = (content / 3600.0).floor + content -= h * 3600 + m = (content / 60.0).floor + content -= m * 60 + s = content + Kernel.format("%02d:%02d:%02d", h, m, s) + end + end + end +end diff --git a/lib/roo/formatters/csv.rb b/lib/roo/formatters/csv.rb new file mode 100644 index 0000000..3181f09 --- /dev/null +++ b/lib/roo/formatters/csv.rb @@ -0,0 +1,84 @@ +module Roo + module Formatters + module CSV + def to_csv(filename = nil, separator = ",", sheet = default_sheet) + if filename + File.open(filename, "w") do |file| + write_csv_content(file, sheet, separator) + end + true + else + sio = ::StringIO.new + write_csv_content(sio, sheet, separator) + sio.rewind + sio.read + end + end + + private + + # Write all cells to the csv file. File can be a filename or nil. If the + # file argument is nil the output goes to STDOUT + def write_csv_content(file = nil, sheet = nil, separator = ",") + file ||= STDOUT + return unless first_row(sheet) # The sheet is empty + + 1.upto(last_row(sheet)) do |row| + 1.upto(last_column(sheet)) do |col| + # TODO: use CSV.generate_line + file.print(separator) if col > 1 + file.print cell_to_csv(row, col, sheet) + end + file.print("\n") + end + end + + # The content of a cell in the csv output + def cell_to_csv(row, col, sheet) + return "" if empty?(row, col, sheet) + + onecell = cell(row, col, sheet) + + case celltype(row, col, sheet) + when :string + %("#{onecell.gsub('"', '""')}") unless onecell.empty? + when :boolean + # TODO: this only works for excelx + onecell = self.sheet_for(sheet).cells[[row, col]].formatted_value + %("#{onecell.gsub('"', '""').downcase}") + when :float, :percentage + if onecell == onecell.to_i + onecell.to_i.to_s + else + onecell.to_s + end + when :formula + case onecell + when String + %("#{onecell.gsub('"', '""')}") unless onecell.empty? + when Integer + onecell.to_s + when Float + if onecell == onecell.to_i + onecell.to_i.to_s + else + onecell.to_s + end + when Date, DateTime, TrueClass, FalseClass + onecell.to_s + else + fail "unhandled onecell-class #{onecell.class}" + end + when :date, :datetime + onecell.to_s + when :time + integer_to_timestring(onecell) + when :link + %("#{onecell.url.gsub('"', '""')}") + else + fail "unhandled celltype #{celltype(row, col, sheet)}" + end || "" + end + end + end +end diff --git a/lib/roo/formatters/matrix.rb b/lib/roo/formatters/matrix.rb new file mode 100644 index 0000000..39c7381 --- /dev/null +++ b/lib/roo/formatters/matrix.rb @@ -0,0 +1,23 @@ +module Roo + module Formatters + module Matrix + # returns a matrix object from the whole sheet or a rectangular area of a sheet + def to_matrix(from_row = nil, from_column = nil, to_row = nil, to_column = nil, sheet = default_sheet) + require 'matrix' + + return ::Matrix.empty unless first_row + + from_row ||= first_row(sheet) + to_row ||= last_row(sheet) + from_column ||= first_column(sheet) + to_column ||= last_column(sheet) + + ::Matrix.rows(from_row.upto(to_row).map do |row| + from_column.upto(to_column).map do |col| + cell(row, col, sheet) + end + end) + end + end + end +end diff --git a/lib/roo/formatters/xml.rb b/lib/roo/formatters/xml.rb new file mode 100644 index 0000000..ebb8cde --- /dev/null +++ b/lib/roo/formatters/xml.rb @@ -0,0 +1,31 @@ +# returns an XML representation of all sheets of a spreadsheet file +module Roo + module Formatters + module XML + def to_xml + Nokogiri::XML::Builder.new do |xml| + xml.spreadsheet do + sheets.each do |sheet| + self.default_sheet = sheet + xml.sheet(name: sheet) do |x| + if first_row && last_row && first_column && last_column + # sonst gibt es Fehler bei leeren Blaettern + first_row.upto(last_row) do |row| + first_column.upto(last_column) do |col| + next if empty?(row, col) + + x.cell(cell(row, col), + row: row, + column: col, + type: celltype(row, col)) + end + end + end + end + end + end + end.to_xml + end + end + end +end diff --git a/lib/roo/formatters/yaml.rb b/lib/roo/formatters/yaml.rb new file mode 100644 index 0000000..69ef3ab --- /dev/null +++ b/lib/roo/formatters/yaml.rb @@ -0,0 +1,40 @@ +module Roo + module Formatters + module YAML + # returns a rectangular area (default: all cells) as yaml-output + # you can add additional attributes with the prefix parameter like: + # oo.to_yaml({"file"=>"flightdata_2007-06-26", "sheet" => "1"}) + def to_yaml(prefix = {}, from_row = nil, from_column = nil, to_row = nil, to_column = nil, sheet = default_sheet) + # return an empty string if there is no first_row, i.e. the sheet is empty + return "" unless first_row + + from_row ||= first_row(sheet) + to_row ||= last_row(sheet) + from_column ||= first_column(sheet) + to_column ||= last_column(sheet) + + result = "--- \n" + from_row.upto(to_row) do |row| + from_column.upto(to_column) do |col| + next if empty?(row, col, sheet) + + result << "cell_#{row}_#{col}: \n" + prefix.each do|k, v| + result << " #{k}: #{v} \n" + end + result << " row: #{row} \n" + result << " col: #{col} \n" + result << " celltype: #{celltype(row, col, sheet)} \n" + value = cell(row, col, sheet) + if celltype(row, col, sheet) == :time + value = integer_to_timestring(value) + end + result << " value: #{value} \n" + end + end + + result + end + end + end +end diff --git a/lib/roo/libre_office.rb b/lib/roo/libre_office.rb new file mode 100644 index 0000000..6aa2332 --- /dev/null +++ b/lib/roo/libre_office.rb @@ -0,0 +1,4 @@ +require 'roo/open_office' + +# LibreOffice is just an alias for Roo::OpenOffice class +Roo::LibreOffice = Roo::OpenOffice diff --git a/lib/roo/link.rb b/lib/roo/link.rb new file mode 100644 index 0000000..72dc143 --- /dev/null +++ b/lib/roo/link.rb @@ -0,0 +1,34 @@ +require 'uri' + +module Roo + class Link < String + # FIXME: Roo::Link inherits from String. A link cell is_a?(Roo::Link). **It is + # the only situation where a cells `value` is always a String**. Link + # cells have a nifty `to_uri` method, but this method isn't easily + # reached. (e.g. `sheet.sheet_for(nil).cells[[row,column]]).value.to_uri`; + # `sheet.hyperlink(row, column)` doesn't use `to_uri`). + # + # 1. Add different types of links (String, Numeric, Date, DateTime, etc.) + # 2. Remove Roo::Link. + # 3. Don't inherit the string and pass the cell's value. + # + # I don't know the historical reasons for the Roo::Link, but right now + # it seems uneccessary. I'm in favor of keeping it just in case. + # + # I'm also in favor of passing the cell's value to Roo::Link. The + # cell.value's class would still be Roo::Link, but the value itself + # would depend on what type of cell it is (Numeric, Date, etc.). + # + attr_reader :href + alias_method :url, :href + + def initialize(href = '', text = href) + super(text) + @href = href + end + + def to_uri + URI.parse href + end + end +end diff --git a/lib/roo/open_office.rb b/lib/roo/open_office.rb new file mode 100644 index 0000000..6ccbe85 --- /dev/null +++ b/lib/roo/open_office.rb @@ -0,0 +1,626 @@ +require 'date' +require 'nokogiri' +require 'cgi' +require 'zip/filesystem' +require 'roo/font' +require 'roo/tempdir' +require 'base64' +require 'openssl' + +module Roo + class OpenOffice < Roo::Base + extend Roo::Tempdir + + ERROR_MISSING_CONTENT_XML = 'file missing required content.xml'.freeze + XPATH_FIND_TABLE_STYLES = "//*[local-name()='automatic-styles']".freeze + XPATH_LOCAL_NAME_TABLE = "//*[local-name()='table']".freeze + + # initialization and opening of a spreadsheet file + # values for packed: :zip + def initialize(filename, options = {}) + packed = options[:packed] + file_warning = options[:file_warning] || :error + + @only_visible_sheets = options[:only_visible_sheets] + file_type_check(filename, '.ods', 'an Roo::OpenOffice', file_warning, packed) + # NOTE: Create temp directory and allow Ruby to cleanup the temp directory + # when the object is garbage collected. Initially, the finalizer was + # created in the Roo::Tempdir module, but that led to a segfault + # when testing in Ruby 2.4.0. + @tmpdir = self.class.make_tempdir(self, find_basename(filename), options[:tmpdir_root]) + ObjectSpace.define_finalizer(self, self.class.finalize(object_id)) + @filename = local_filename(filename, @tmpdir, packed) + # TODO: @cells_read[:default] = false + open_oo_file(options) + super(filename, options) + initialize_default_variables + + unless @table_display.any? + doc.xpath(XPATH_FIND_TABLE_STYLES).each do |style| + read_table_styles(style) + end + end + + @sheet_names = doc.xpath(XPATH_LOCAL_NAME_TABLE).map do |sheet| + if !@only_visible_sheets || @table_display[attribute(sheet, 'style-name')] + sheet.attributes['name'].value + end + end.compact + rescue + self.class.finalize_tempdirs(object_id) + raise + end + + def open_oo_file(options) + Zip::File.open(@filename) do |zip_file| + content_entry = zip_file.glob('content.xml').first + fail ArgumentError, ERROR_MISSING_CONTENT_XML unless content_entry + + roo_content_xml_path = ::File.join(@tmpdir, 'roo_content.xml') + content_entry.extract(roo_content_xml_path) + decrypt_if_necessary(zip_file, content_entry, roo_content_xml_path, options) + end + end + + def initialize_default_variables + @formula = {} + @style = {} + @style_defaults = Hash.new { |h, k| h[k] = [] } + @table_display = Hash.new { |h, k| h[k] = true } + @font_style_definitions = {} + @comment = {} + @comments_read = {} + end + + def method_missing(m, *args) + read_labels + # is method name a label name + if @label.key?(m.to_s) + row, col = label(m.to_s) + cell(row, col) + else + # call super for methods like #a1 + super + end + end + + # Returns the content of a spreadsheet-cell. + # (1,1) is the upper left corner. + # (1,1), (1,'A'), ('A',1), ('a',1) all refers to the + # cell at the first line and first row. + def cell(row, col, sheet = nil) + sheet ||= default_sheet + read_cells(sheet) + row, col = normalize(row, col) + if celltype(row, col, sheet) == :date + yyyy, mm, dd = @cell[sheet][[row, col]].to_s.split('-') + return Date.new(yyyy.to_i, mm.to_i, dd.to_i) + end + + @cell[sheet][[row, col]] + end + + # Returns the formula at (row,col). + # Returns nil if there is no formula. + # The method #formula? checks if there is a formula. + def formula(row, col, sheet = nil) + sheet ||= default_sheet + read_cells(sheet) + row, col = normalize(row, col) + @formula[sheet][[row, col]] + end + + # Predicate methods really should return a boolean + # value. Hopefully no one was relying on the fact that this + # previously returned either nil/formula + def formula?(*args) + !!formula(*args) + end + + # returns each formula in the selected sheet as an array of elements + # [row, col, formula] + def formulas(sheet = nil) + sheet ||= default_sheet + read_cells(sheet) + return [] unless @formula[sheet] + @formula[sheet].each.collect do |elem| + [elem[0][0], elem[0][1], elem[1]] + end + end + + # Given a cell, return the cell's style + def font(row, col, sheet = nil) + sheet ||= default_sheet + read_cells(sheet) + row, col = normalize(row, col) + style_name = @style[sheet][[row, col]] || @style_defaults[sheet][col - 1] || 'Default' + @font_style_definitions[style_name] + end + + # returns the type of a cell: + # * :float + # * :string + # * :date + # * :percentage + # * :formula + # * :time + # * :datetime + def celltype(row, col, sheet = nil) + sheet ||= default_sheet + read_cells(sheet) + row, col = normalize(row, col) + @formula[sheet][[row, col]] ? :formula : @cell_type[sheet][[row, col]] + end + + def sheets + @sheet_names + end + + # version of the Roo::OpenOffice document + # at 2007 this is always "1.0" + def officeversion + oo_version + @officeversion + end + + # shows the internal representation of all cells + # mainly for debugging purposes + def to_s(sheet = nil) + sheet ||= default_sheet + read_cells(sheet) + @cell[sheet].inspect + end + + # returns the row,col values of the labelled cell + # (nil,nil) if label is not defined + def label(labelname) + read_labels + return [nil, nil, nil] if @label.size < 1 || !@label.key?(labelname) + [ + @label[labelname][1].to_i, + ::Roo::Utils.letter_to_number(@label[labelname][2]), + @label[labelname][0] + ] + end + + # Returns an array which all labels. Each element is an array with + # [labelname, [row,col,sheetname]] + def labels(_sheet = nil) + read_labels + @label.map do |label| + [label[0], # name + [label[1][1].to_i, # row + ::Roo::Utils.letter_to_number(label[1][2]), # column + label[1][0], # sheet + ]] + end + end + + # returns the comment at (row/col) + # nil if there is no comment + def comment(row, col, sheet = nil) + sheet ||= default_sheet + read_cells(sheet) + row, col = normalize(row, col) + return nil unless @comment[sheet] + @comment[sheet][[row, col]] + end + + # returns each comment in the selected sheet as an array of elements + # [row, col, comment] + def comments(sheet = nil) + sheet ||= default_sheet + read_comments(sheet) unless @comments_read[sheet] + return [] unless @comment[sheet] + @comment[sheet].each.collect do |elem| + [elem[0][0], elem[0][1], elem[1]] + end + end + + private + + # If the ODS file has an encryption-data element, then try to decrypt. + # If successful, the temporary content.xml will be overwritten with + # decrypted contents. + def decrypt_if_necessary( + zip_file, + content_entry, + roo_content_xml_path, options + ) + # Check if content.xml is encrypted by extracting manifest.xml + # and searching for a manifest:encryption-data element + + if (manifest_entry = zip_file.glob('META-INF/manifest.xml').first) + roo_manifest_xml_path = File.join(@tmpdir, 'roo_manifest.xml') + manifest_entry.extract(roo_manifest_xml_path) + manifest = ::Roo::Utils.load_xml(roo_manifest_xml_path) + + # XPath search for manifest:encryption-data only for the content.xml + # file + + encryption_data = manifest.xpath( + "//manifest:file-entry[@manifest:full-path='content.xml']"\ + "/manifest:encryption-data" + ).first + + # If XPath returns a node, then we know content.xml is encrypted + + unless encryption_data.nil? + + # Since we know it's encrypted, we check for the password option + # and if it doesn't exist, raise an argument error + + password = options[:password] + if !password.nil? + perform_decryption( + encryption_data, + password, + content_entry, + roo_content_xml_path + ) + else + fail ArgumentError, 'file is encrypted but password was not supplied' + end + end + else + fail ArgumentError, 'file missing required META-INF/manifest.xml' + end + end + + # Process the ODS encryption manifest and perform the decryption + def perform_decryption( + encryption_data, + password, + content_entry, + roo_content_xml_path + ) + # Extract various expected attributes from the manifest that + # describe the encryption + + algorithm_node = encryption_data.xpath('manifest:algorithm').first + key_derivation_node = + encryption_data.xpath('manifest:key-derivation').first + start_key_generation_node = + encryption_data.xpath('manifest:start-key-generation').first + + # If we have all the expected elements, then we can perform + # the decryption. + + if !algorithm_node.nil? && !key_derivation_node.nil? && + !start_key_generation_node.nil? + + # The algorithm is a URI describing the algorithm used + algorithm = algorithm_node['manifest:algorithm-name'] + + # The initialization vector is base-64 encoded + iv = Base64.decode64( + algorithm_node['manifest:initialisation-vector'] + ) + key_derivation_name = key_derivation_node['manifest:key-derivation-name'] + iteration_count = key_derivation_node['manifest:iteration-count'].to_i + salt = Base64.decode64(key_derivation_node['manifest:salt']) + + # The key is hashed with an algorithm represented by this URI + key_generation_name = + start_key_generation_node[ + 'manifest:start-key-generation-name' + ] + + hashed_password = password + + if key_generation_name == 'http://www.w3.org/2000/09/xmldsig#sha256' + + hashed_password = Digest::SHA256.digest(password) + else + fail ArgumentError, "Unknown key generation algorithm #{key_generation_name}" + end + + cipher = find_cipher( + algorithm, + key_derivation_name, + hashed_password, + salt, + iteration_count, + iv + ) + + begin + decrypted = decrypt(content_entry, cipher) + + # Finally, inflate the decrypted stream and overwrite + # content.xml + IO.binwrite( + roo_content_xml_path, + Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(decrypted) + ) + rescue StandardError => error + raise ArgumentError, "Invalid password or other data error: #{error}" + end + else + fail ArgumentError, 'manifest.xml missing encryption-data elements' + end + end + + # Create a cipher based on an ODS algorithm URI from manifest.xml + # params: algorithm, key_derivation_name, hashed_password, salt, iteration_count, iv + def find_cipher(*args) + fail ArgumentError, 'Unknown algorithm ' + algorithm unless args[0] == 'http://www.w3.org/2001/04/xmlenc#aes256-cbc' + + cipher = ::OpenSSL::Cipher.new('AES-256-CBC') + cipher.decrypt + cipher.padding = 0 + cipher.key = find_cipher_key(cipher, *args[1..4]) + cipher.iv = args[5] + + cipher + end + + # Create a cipher key based on an ODS algorithm string from manifest.xml + def find_cipher_key(*args) + fail ArgumentError, 'Unknown key derivation name ', args[1] unless args[1] == 'PBKDF2' + + ::OpenSSL::PKCS5.pbkdf2_hmac_sha1(args[2], args[3], args[4], args[0].key_len) + end + + # Block decrypt raw bytes from the zip file based on the cipher + def decrypt(content_entry, cipher) + # Zip::Entry.extract writes a 0-length file when trying + # to extract an encrypted stream, so we read the + # raw bytes based on the offset and lengths + decrypted = '' + File.open(@filename, 'rb') do |zipfile| + zipfile.seek( + content_entry.local_header_offset + + content_entry.calculate_local_header_size + ) + total_to_read = content_entry.compressed_size + + block_size = 4096 + block_size = total_to_read if block_size > total_to_read + + while (buffer = zipfile.read(block_size)) + decrypted += cipher.update(buffer) + total_to_read -= buffer.length + + break if total_to_read == 0 + + block_size = total_to_read if block_size > total_to_read + end + end + + decrypted + cipher.final + end + + def doc + @doc ||= ::Roo::Utils.load_xml(File.join(@tmpdir, 'roo_content.xml')) + end + + # read the version of the OO-Version + def oo_version + doc.xpath("//*[local-name()='document-content']").each do |office| + @officeversion = attribute(office, 'version') + end + end + + # helper function to set the internal representation of cells + def set_cell_values(sheet, x, y, i, v, value_type, formula, table_cell, str_v, style_name) + key = [y, x + i] + @cell_type[sheet] ||= {} + @cell_type[sheet][key] = value_type.to_sym if value_type + @formula[sheet] ||= {} + if formula + ['of:', 'oooc:'].each do |prefix| + if formula[0, prefix.length] == prefix + formula = formula[prefix.length..-1] + end + end + @formula[sheet][key] = formula + end + @cell[sheet] ||= {} + @style[sheet] ||= {} + @style[sheet][key] = style_name + case @cell_type[sheet][key] + when :float + @cell[sheet][key] = (table_cell.attributes['value'].to_s.include?(".") || table_cell.children.first.text.include?(".")) ? v.to_f : v.to_i + when :percentage + @cell[sheet][key] = v.to_f + when :string + @cell[sheet][key] = str_v + when :date + # TODO: if table_cell.attributes['date-value'].size != "XXXX-XX-XX".size + if attribute(table_cell, 'date-value').size != 'XXXX-XX-XX'.size + #-- dann ist noch eine Uhrzeit vorhanden + #-- "1961-11-21T12:17:18" + @cell[sheet][key] = DateTime.parse(attribute(table_cell, 'date-value').to_s) + @cell_type[sheet][key] = :datetime + else + @cell[sheet][key] = table_cell.attributes['date-value'] + end + when :time + hms = v.split(':') + @cell[sheet][key] = hms[0].to_i * 3600 + hms[1].to_i * 60 + hms[2].to_i + else + @cell[sheet][key] = v + end + end + + # read all cells in the selected sheet + #-- + # the following construct means '4 blanks' + # some content <text:s text:c="3"/> + #++ + def read_cells(sheet = default_sheet) + validate_sheet!(sheet) + return if @cells_read[sheet] + + sheet_found = false + doc.xpath("//*[local-name()='table']").each do |ws| + next unless sheet == attribute(ws, 'name') + + sheet_found = true + col = 1 + row = 1 + ws.children.each do |table_element| + case table_element.name + when 'table-column' + @style_defaults[sheet] << table_element.attributes['default-cell-style-name'] + when 'table-row' + if table_element.attributes['number-rows-repeated'] + skip_row = attribute(table_element, 'number-rows-repeated').to_s.to_i + row = row + skip_row - 1 + end + table_element.children.each do |cell| + skip_col = attribute(cell, 'number-columns-repeated') + formula = attribute(cell, 'formula') + value_type = attribute(cell, 'value-type') + v = attribute(cell, 'value') + style_name = attribute(cell, 'style-name') + case value_type + when 'string' + str_v = '' + # insert \n if there is more than one paragraph + para_count = 0 + cell.children.each do |str| + # begin comments + #=begin + #- <table:table-cell office:value-type="string"> + # - <office:annotation office:display="true" draw:style-name="gr1" draw:text-style-name="P1" svg:width="1.1413in" svg:height="0.3902in" svg:x="2.0142in" svg:y="0in" draw:caption-point-x="-0.2402in" draw:caption-point-y="0.5661in"> + # <dc:date>2011-09-20T00:00:00</dc:date> + # <text:p text:style-name="P1">Kommentar fuer B4</text:p> + # </office:annotation> + # <text:p>B4 (mit Kommentar)</text:p> + # </table:table-cell> + #=end + if str.name == 'annotation' + str.children.each do |annotation| + next unless annotation.name == 'p' + # @comment ist ein Hash mit Sheet als Key (wie bei @cell) + # innerhalb eines Elements besteht ein Eintrag aus einem + # weiteren Hash mit Key [row,col] und dem eigentlichen + # Kommentartext als Inhalt + @comment[sheet] = Hash.new unless @comment[sheet] + key = [row, col] + @comment[sheet][key] = annotation.text + end + end + # end comments + if str.name == 'p' + v = str.content + str_v += "\n" if para_count > 0 + para_count += 1 + if str.children.size > 1 + str_v += children_to_string(str.children) + else + str.children.each do |child| + str_v += child.content #.text + end + end + str_v.gsub!(/'/, "'") # special case not supported by unescapeHTML + str_v = CGI.unescapeHTML(str_v) + end # == 'p' + end + when 'time' + cell.children.each do |str| + v = str.content if str.name == 'p' + end + when '', nil, 'date', 'percentage', 'float' + # + when 'boolean' + v = attribute(cell, 'boolean-value').to_s + end + if skip_col + if !v.nil? || cell.attributes['date-value'] + 0.upto(skip_col.to_i - 1) do |i| + set_cell_values(sheet, col, row, i, v, value_type, formula, cell, str_v, style_name) + end + end + col += (skip_col.to_i - 1) + end # if skip + set_cell_values(sheet, col, row, 0, v, value_type, formula, cell, str_v, style_name) + col += 1 + end + row += 1 + col = 1 + end + end + end + doc.xpath("//*[local-name()='automatic-styles']").each do |style| + read_styles(style) + end + + fail RangeError unless sheet_found + + @cells_read[sheet] = true + @comments_read[sheet] = true + end + + # Only calls read_cells because Roo::Base calls read_comments + # whereas the reading of comments is done in read_cells for Roo::OpenOffice-objects + def read_comments(sheet = nil) + read_cells(sheet) + end + + def read_labels + @label ||= Hash[doc.xpath('//table:named-range').map do |ne| + #- + # $Sheet1.$C$5 + #+ + name = attribute(ne, 'name').to_s + sheetname, coords = attribute(ne, 'cell-range-address').to_s.split('.$') + col, row = coords.split('$') + sheetname = sheetname[1..-1] if sheetname[0, 1] == '$' + [name, [sheetname, row, col]] + end] + end + + def read_styles(style_elements) + @font_style_definitions['Default'] = Roo::Font.new + style_elements.each do |style| + next unless style.name == 'style' + style_name = attribute(style, 'name') + style.each do |properties| + font = Roo::OpenOffice::Font.new + font.bold = attribute(properties, 'font-weight') + font.italic = attribute(properties, 'font-style') + font.underline = attribute(properties, 'text-underline-style') + @font_style_definitions[style_name] = font + end + end + end + + def read_table_styles(styles) + styles.children.each do |style| + next unless style.name == 'style' + style_name = attribute(style, 'name') + style.children.each do |properties| + display = attribute(properties, 'display') + next unless display + @table_display[style_name] = (display == 'true') + end + end + end + + # helper method to convert compressed spaces and other elements within + # an text into a string + # FIXME: add a test for compressed_spaces == 0. It's not currently tested. + def children_to_string(children) + children.map do |child| + if child.text? + child.content + else + if child.name == 's' + compressed_spaces = child.attributes['c'].to_s.to_i + # no explicit number means a count of 1: + compressed_spaces == 0 ? ' ' : ' ' * compressed_spaces + else + child.content + end + end + end.join + end + + def attribute(node, attr_name) + node.attributes[attr_name].value if node.attributes[attr_name] + end + end +end diff --git a/lib/roo/spreadsheet.rb b/lib/roo/spreadsheet.rb new file mode 100644 index 0000000..1eef58d --- /dev/null +++ b/lib/roo/spreadsheet.rb @@ -0,0 +1,33 @@ +require 'uri' + +module Roo + class Spreadsheet + class << self + def open(path, options = {}) + path = path.respond_to?(:path) ? path.path : path + extension = extension_for(path, options) + + begin + Roo::CLASS_FOR_EXTENSION.fetch(extension).new(path, options) + rescue KeyError + raise ArgumentError, + "Can't detect the type of #{path} - please use the :extension option to declare its type." + end + end + + def extension_for(path, options) + case (extension = options.delete(:extension)) + when ::Symbol + options[:file_warning] = :ignore + extension + when ::String + options[:file_warning] = :ignore + extension.tr('.', '').downcase.to_sym + else + res = ::File.extname((path =~ /\A#{::URI.regexp}\z/) ? ::URI.parse(::URI.encode(path)).path : path) + res.tr('.', '').downcase.to_sym + end + end + end + end +end diff --git a/lib/roo/tempdir.rb b/lib/roo/tempdir.rb new file mode 100644 index 0000000..4be755b --- /dev/null +++ b/lib/roo/tempdir.rb @@ -0,0 +1,21 @@ +module Roo + module Tempdir + def finalize_tempdirs(object_id) + if @tempdirs && (dirs_to_remove = @tempdirs[object_id]) + @tempdirs.delete(object_id) + dirs_to_remove.each do |dir| + ::FileUtils.remove_entry(dir) + end + end + end + + def make_tempdir(object, prefix, root) + root ||= ENV["ROO_TMP"] + # NOTE: This folder is cleaned up by finalize_tempdirs. + ::Dir.mktmpdir("#{Roo::TEMP_PREFIX}#{prefix}", root).tap do |tmpdir| + @tempdirs ||= Hash.new { |h, k| h[k] = [] } + @tempdirs[object.object_id] << tmpdir + end + end + end +end diff --git a/lib/roo/utils.rb b/lib/roo/utils.rb new file mode 100644 index 0000000..17b672d --- /dev/null +++ b/lib/roo/utils.rb @@ -0,0 +1,78 @@ +module Roo + module Utils + extend self + + LETTERS = ('A'..'Z').to_a + + def split_coordinate(str) + @split_coordinate ||= {} + + @split_coordinate[str] ||= begin + letter, number = split_coord(str) + x = letter_to_number(letter) + y = number + [y, x] + end + end + + alias_method :ref_to_key, :split_coordinate + + def split_coord(s) + if s =~ /([a-zA-Z]+)([0-9]+)/ + letter = Regexp.last_match[1] + number = Regexp.last_match[2].to_i + else + fail ArgumentError + end + [letter, number] + end + + # convert a number to something like 'AB' (1 => 'A', 2 => 'B', ...) + def number_to_letter(num) + result = "" + + until num.zero? + num, index = (num - 1).divmod(26) + result.prepend(LETTERS[index]) + end + + result + end + + def letter_to_number(letters) + @letter_to_number ||= {} + @letter_to_number[letters] ||= begin + result = 0 + + # :bytes method returns an enumerator in 1.9.3 and an array in 2.0+ + letters.bytes.to_a.map{|b| b > 96 ? b - 96 : b - 64 }.reverse.each_with_index{ |num, i| result += num * 26 ** i } + + result + end + end + + # Compute upper bound for cells in a given cell range. + def num_cells_in_range(str) + cells = str.split(':') + return 1 if cells.count == 1 + raise ArgumentError.new("invalid range string: #{str}. Supported range format 'A1:B2'") if cells.count != 2 + x1, y1 = split_coordinate(cells[0]) + x2, y2 = split_coordinate(cells[1]) + (x2 - (x1 - 1)) * (y2 - (y1 - 1)) + end + + def load_xml(path) + ::File.open(path, 'rb') do |file| + ::Nokogiri::XML(file) + end + end + + # Yield each element of a given type ('row', 'c', etc.) to caller + def each_element(path, elements) + Nokogiri::XML::Reader(::File.open(path, 'rb'), nil, nil, Nokogiri::XML::ParseOptions::NOBLANKS).each do |node| + next unless node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT && Array(elements).include?(node.name) + yield Nokogiri::XML(node.outer_xml).root if block_given? + end + end + end +end diff --git a/lib/roo/version.rb b/lib/roo/version.rb new file mode 100644 index 0000000..bdd16dc --- /dev/null +++ b/lib/roo/version.rb @@ -0,0 +1,3 @@ +module Roo + VERSION = "2.7.1" +end |
