aboutsummaryrefslogtreecommitdiffstats
path: root/lib/roo/excelx.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/roo/excelx.rb')
-rw-r--r--lib/roo/excelx.rb448
1 files changed, 448 insertions, 0 deletions
diff --git a/lib/roo/excelx.rb b/lib/roo/excelx.rb
new file mode 100644
index 0000000..82c1431
--- /dev/null
+++ b/lib/roo/excelx.rb
@@ -0,0 +1,448 @@
+require 'nokogiri'
+require 'zip/filesystem'
+require 'roo/link'
+require 'roo/tempdir'
+require 'roo/utils'
+require 'forwardable'
+require 'set'
+
+module Roo
+ class Excelx < Roo::Base
+ extend Roo::Tempdir
+ extend Forwardable
+
+ ERROR_VALUES = %w(#N/A #REF! #NAME? #DIV/0! #NULL! #VALUE! #NUM!).to_set
+
+ require 'roo/excelx/shared'
+ require 'roo/excelx/workbook'
+ require 'roo/excelx/shared_strings'
+ require 'roo/excelx/styles'
+ require 'roo/excelx/cell'
+ require 'roo/excelx/sheet'
+ require 'roo/excelx/relationships'
+ require 'roo/excelx/comments'
+ require 'roo/excelx/sheet_doc'
+ require 'roo/excelx/coordinate'
+ require 'roo/excelx/format'
+
+ delegate [:styles, :workbook, :shared_strings, :rels_files, :sheet_files, :comments_files] => :@shared
+ ExceedsMaxError = Class.new(StandardError)
+
+ # initialization and opening of a spreadsheet file
+ # values for packed: :zip
+ # optional cell_max (int) parameter for early aborting attempts to parse
+ # enormous documents.
+ def initialize(filename_or_stream, options = {})
+ packed = options[:packed]
+ file_warning = options.fetch(:file_warning, :error)
+ cell_max = options.delete(:cell_max)
+ sheet_options = {}
+ sheet_options[:expand_merged_ranges] = (options[:expand_merged_ranges] || false)
+ sheet_options[:no_hyperlinks] = (options[:no_hyperlinks] || false)
+
+ unless is_stream?(filename_or_stream)
+ file_type_check(filename_or_stream, %w[.xlsx .xlsm], 'an Excel 2007', file_warning, packed)
+ basename = find_basename(filename_or_stream)
+ end
+
+ # NOTE: Create temp directory and allow Ruby to cleanup the temp directory
+ # when the object is garbage collected. Initially, the finalizer was
+ # created in the Roo::Tempdir module, but that led to a segfault
+ # when testing in Ruby 2.4.0.
+ @tmpdir = self.class.make_tempdir(self, basename, options[:tmpdir_root])
+ ObjectSpace.define_finalizer(self, self.class.finalize(object_id))
+
+ @shared = Shared.new(@tmpdir)
+ @filename = local_filename(filename_or_stream, @tmpdir, packed)
+ process_zipfile(@filename || filename_or_stream)
+
+ @sheet_names = workbook.sheets.map do |sheet|
+ unless options[:only_visible_sheets] && sheet['state'] == 'hidden'
+ sheet['name']
+ end
+ end.compact
+ @sheets = []
+ @sheets_by_name = Hash[@sheet_names.map.with_index do |sheet_name, n|
+ @sheets[n] = Sheet.new(sheet_name, @shared, n, sheet_options)
+ [sheet_name, @sheets[n]]
+ end]
+
+ if cell_max
+ cell_count = ::Roo::Utils.num_cells_in_range(sheet_for(options.delete(:sheet)).dimensions)
+ raise ExceedsMaxError.new("Excel file exceeds cell maximum: #{cell_count} > #{cell_max}") if cell_count > cell_max
+ end
+
+ super
+ rescue
+ self.class.finalize_tempdirs(object_id)
+ raise
+ end
+
+ def method_missing(method, *args)
+ if (label = workbook.defined_names[method.to_s])
+ safe_send(sheet_for(label.sheet).cells[label.key], :value)
+ else
+ # call super for methods like #a1
+ super
+ end
+ end
+
+ def sheets
+ @sheet_names
+ end
+
+ def sheet_for(sheet)
+ sheet ||= default_sheet
+ validate_sheet!(sheet)
+ @sheets_by_name[sheet]
+ end
+
+ # Returns the content of a spreadsheet-cell.
+ # (1,1) is the upper left corner.
+ # (1,1), (1,'A'), ('A',1), ('a',1) all refers to the
+ # cell at the first line and first row.
+ def cell(row, col, sheet = nil)
+ key = normalize(row, col)
+ safe_send(sheet_for(sheet).cells[key], :value)
+ end
+
+ def row(rownumber, sheet = nil)
+ sheet_for(sheet).row(rownumber)
+ end
+
+ # returns all values in this column as an array
+ # column numbers are 1,2,3,... like in the spreadsheet
+ def column(column_number, sheet = nil)
+ if column_number.is_a?(::String)
+ column_number = ::Roo::Utils.letter_to_number(column_number)
+ end
+ sheet_for(sheet).column(column_number)
+ end
+
+ # returns the number of the first non-empty row
+ def first_row(sheet = nil)
+ sheet_for(sheet).first_row
+ end
+
+ # returns the number of the last non-empty row
+ def last_row(sheet = nil)
+ sheet_for(sheet).last_row
+ end
+
+ # returns the number of the first non-empty column
+ def first_column(sheet = nil)
+ sheet_for(sheet).first_column
+ end
+
+ # returns the number of the last non-empty column
+ def last_column(sheet = nil)
+ sheet_for(sheet).last_column
+ end
+
+ # set a cell to a certain value
+ # (this will not be saved back to the spreadsheet file!)
+ def set(row, col, value, sheet = nil) #:nodoc:
+ key = normalize(row, col)
+ cell_type = cell_type_by_value(value)
+ sheet_for(sheet).cells[key] = Cell.new(value, cell_type, nil, cell_type, value, nil, nil, nil, Coordinate.new(row, col))
+ end
+
+ # Returns the formula at (row,col).
+ # Returns nil if there is no formula.
+ # The method #formula? checks if there is a formula.
+ def formula(row, col, sheet = nil)
+ key = normalize(row, col)
+ safe_send(sheet_for(sheet).cells[key], :formula)
+ end
+
+ # Predicate methods really should return a boolean
+ # value. Hopefully no one was relying on the fact that this
+ # previously returned either nil/formula
+ def formula?(*args)
+ !!formula(*args)
+ end
+
+ # returns each formula in the selected sheet as an array of tuples in following format
+ # [[row, col, formula], [row, col, formula],...]
+ def formulas(sheet = nil)
+ sheet_for(sheet).cells.select { |_, cell| cell.formula }.map do |(x, y), cell|
+ [x, y, cell.formula]
+ end
+ end
+
+ # Given a cell, return the cell's style
+ def font(row, col, sheet = nil)
+ key = normalize(row, col)
+ definition_index = safe_send(sheet_for(sheet).cells[key], :style)
+ styles.definitions[definition_index] if definition_index
+ end
+
+ # returns the type of a cell:
+ # * :float
+ # * :string,
+ # * :date
+ # * :percentage
+ # * :formula
+ # * :time
+ # * :datetime
+ def celltype(row, col, sheet = nil)
+ key = normalize(row, col)
+ safe_send(sheet_for(sheet).cells[key], :type)
+ end
+
+ # returns the internal type of an excel cell
+ # * :numeric_or_formula
+ # * :string
+ # Note: this is only available within the Excelx class
+ def excelx_type(row, col, sheet = nil)
+ key = normalize(row, col)
+ safe_send(sheet_for(sheet).cells[key], :cell_type)
+ end
+
+ # returns the internal value of an excelx cell
+ # Note: this is only available within the Excelx class
+ def excelx_value(row, col, sheet = nil)
+ key = normalize(row, col)
+ safe_send(sheet_for(sheet).cells[key], :cell_value)
+ end
+
+ # returns the internal value of an excelx cell
+ # Note: this is only available within the Excelx class
+ def formatted_value(row, col, sheet = nil)
+ key = normalize(row, col)
+ safe_send(sheet_for(sheet).cells[key], :formatted_value)
+ end
+
+ # returns the internal format of an excel cell
+ def excelx_format(row, col, sheet = nil)
+ key = normalize(row, col)
+ sheet_for(sheet).excelx_format(key)
+ end
+
+ def empty?(row, col, sheet = nil)
+ sheet = sheet_for(sheet)
+ key = normalize(row, col)
+ cell = sheet.cells[key]
+ !cell || cell.empty? ||
+ (row < sheet.first_row || row > sheet.last_row || col < sheet.first_column || col > sheet.last_column)
+ end
+
+ # shows the internal representation of all cells
+ # for debugging purposes
+ def to_s(sheet = nil)
+ sheet_for(sheet).cells.inspect
+ end
+
+ # returns the row,col values of the labelled cell
+ # (nil,nil) if label is not defined
+ def label(name)
+ labels = workbook.defined_names
+ return [nil, nil, nil] if labels.empty? || !labels.key?(name)
+
+ [labels[name].row, labels[name].col, labels[name].sheet]
+ end
+
+ # Returns an array which all labels. Each element is an array with
+ # [labelname, [row,col,sheetname]]
+ def labels
+ @labels ||= workbook.defined_names.map do |name, label|
+ [
+ name,
+ [label.row, label.col, label.sheet]
+ ]
+ end
+ end
+
+ def hyperlink?(row, col, sheet = nil)
+ !!hyperlink(row, col, sheet)
+ end
+
+ # returns the hyperlink at (row/col)
+ # nil if there is no hyperlink
+ def hyperlink(row, col, sheet = nil)
+ key = normalize(row, col)
+ sheet_for(sheet).hyperlinks[key]
+ end
+
+ # returns the comment at (row/col)
+ # nil if there is no comment
+ def comment(row, col, sheet = nil)
+ key = normalize(row, col)
+ sheet_for(sheet).comments[key]
+ end
+
+ # true, if there is a comment
+ def comment?(row, col, sheet = nil)
+ !!comment(row, col, sheet)
+ end
+
+ def comments(sheet = nil)
+ sheet_for(sheet).comments.map do |(x, y), comment|
+ [x, y, comment]
+ end
+ end
+
+ # Yield an array of Excelx::Cell
+ # Takes options for sheet, pad_cells, and max_rows
+ def each_row_streaming(options = {})
+ sheet = sheet_for(options.delete(:sheet))
+ if block_given?
+ sheet.each_row(options) { |row| yield row }
+ else
+ sheet.to_enum(:each_row, options)
+ end
+ end
+
+ private
+
+ def clean_sheet(sheet)
+ @sheets_by_name[sheet].cells.each_pair do |coord, value|
+ next unless value.value.is_a?(::String)
+
+ @sheets_by_name[sheet].cells[coord].value = sanitize_value(value.value)
+ end
+
+ @cleaned[sheet] = true
+ end
+
+ # Internal: extracts the worksheet_ids from the workbook.xml file. xlsx
+ # documents require a workbook.xml file, so a if the file is missing
+ # it is not a valid xlsx file. In these cases, an ArgumentError is
+ # raised.
+ #
+ # wb - a Zip::Entry for the workbook.xml file.
+ # path - A String for Zip::Entry's destination path.
+ #
+ # Examples
+ #
+ # extract_worksheet_ids(<Zip::Entry>, 'tmpdir/roo_workbook.xml')
+ # # => ["rId1", "rId2", "rId3"]
+ #
+ # Returns an Array of Strings.
+ def extract_worksheet_ids(entries, path)
+ wb = entries.find { |e| e.name[/workbook.xml$/] }
+ fail ArgumentError 'missing required workbook file' if wb.nil?
+
+ wb.extract(path)
+ workbook_doc = Roo::Utils.load_xml(path).remove_namespaces!
+ workbook_doc.xpath('//sheet').map { |s| s.attributes['id'].value }
+ end
+
+ # Internal
+ #
+ # wb_rels - A Zip::Entry for the workbook.xml.rels file.
+ # path - A String for the Zip::Entry's destination path.
+ #
+ # Examples
+ #
+ # extract_worksheets(<Zip::Entry>, 'tmpdir/roo_workbook.xml.rels')
+ # # => {
+ # "rId1"=>"worksheets/sheet1.xml",
+ # "rId2"=>"worksheets/sheet2.xml",
+ # "rId3"=>"worksheets/sheet3.xml"
+ # }
+ #
+ # Returns a Hash.
+ def extract_worksheet_rels(entries, path)
+ wb_rels = entries.find { |e| e.name[/workbook.xml.rels$/] }
+ fail ArgumentError 'missing required workbook file' if wb_rels.nil?
+
+ wb_rels.extract(path)
+ rels_doc = Roo::Utils.load_xml(path).remove_namespaces!
+ worksheet_type = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet'
+
+ relationships = rels_doc.xpath('//Relationship').select do |relationship|
+ relationship.attributes['Type'].value == worksheet_type
+ end
+
+ relationships.inject({}) do |hash, relationship|
+ attributes = relationship.attributes
+ id = attributes['Id']
+ hash[id.value] = attributes['Target'].value
+ hash
+ end
+ end
+
+ # Extracts the sheets in order, but it will ignore sheets that are not
+ # worksheets.
+ def extract_sheets_in_order(entries, sheet_ids, sheets, tmpdir)
+ (sheet_ids & sheets.keys).each_with_index do |id, i|
+ name = sheets[id]
+ entry = entries.find { |e| "/#{e.name}" =~ /#{name}$/ }
+ path = "#{tmpdir}/roo_sheet#{i + 1}"
+ sheet_files << path
+ @sheet_files << path
+ entry.extract(path)
+ end
+ end
+
+ # Extracts all needed files from the zip file
+ def process_zipfile(zipfilename_or_stream)
+ @sheet_files = []
+
+ unless is_stream?(zipfilename_or_stream)
+ zip_file = Zip::File.open(zipfilename_or_stream)
+ else
+ zip_file = Zip::CentralDirectory.new
+ zip_file.read_from_stream zipfilename_or_stream
+ end
+
+ process_zipfile_entries zip_file.to_a.sort_by(&:name)
+ end
+
+ def process_zipfile_entries(entries)
+ # NOTE: When Google or Numbers 3.1 exports to xlsx, the worksheet filenames
+ # are not in order. With Numbers 3.1, the first sheet is always
+ # sheet.xml, not sheet1.xml. With Google, the order of the worksheets is
+ # independent of a worksheet's filename (i.e. sheet6.xml can be the
+ # first worksheet).
+ #
+ # workbook.xml lists the correct order of worksheets and
+ # workbook.xml.rels lists the filenames for those worksheets.
+ #
+ # workbook.xml:
+ # <sheet state="visible" name="IS" sheetId="1" r:id="rId3"/>
+ # <sheet state="visible" name="BS" sheetId="2" r:id="rId4"/>
+ # workbook.xml.rel:
+ # <Relationship Id="rId4" Target="worksheets/sheet5.xml" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet"/>
+ # <Relationship Id="rId3" Target="worksheets/sheet4.xml" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet"/>
+ sheet_ids = extract_worksheet_ids(entries, "#{@tmpdir}/roo_workbook.xml")
+ sheets = extract_worksheet_rels(entries, "#{@tmpdir}/roo_workbook.xml.rels")
+ extract_sheets_in_order(entries, sheet_ids, sheets, @tmpdir)
+
+ entries.each do |entry|
+ path =
+ case entry.name.downcase
+ when /sharedstrings.xml$/
+ "#{@tmpdir}/roo_sharedStrings.xml"
+ when /styles.xml$/
+ "#{@tmpdir}/roo_styles.xml"
+ when /comments([0-9]+).xml$/
+ # FIXME: Most of the time, The order of the comment files are the same
+ # the sheet order, i.e. sheet1.xml's comments are in comments1.xml.
+ # In some situations, this isn't true. The true location of a
+ # sheet's comment file is in the sheet1.xml.rels file. SEE
+ # ECMA-376 12.3.3 in "Ecma Office Open XML Part 1".
+ nr = Regexp.last_match[1].to_i
+ comments_files[nr - 1] = "#{@tmpdir}/roo_comments#{nr}"
+ when %r{chartsheets/_rels/sheet([0-9]+).xml.rels$}
+ # NOTE: Chart sheet relationship files were interfering with
+ # worksheets.
+ nil
+ when /sheet([0-9]+).xml.rels$/
+ # FIXME: Roo seems to use sheet[\d].xml.rels for hyperlinks only, but
+ # it also stores the location for sharedStrings, comments,
+ # drawings, etc.
+ nr = Regexp.last_match[1].to_i
+ rels_files[nr - 1] = "#{@tmpdir}/roo_rels#{nr}"
+ end
+
+ entry.extract(path) if path
+ end
+ end
+
+ def safe_send(object, method, *args)
+ object.send(method, *args) if object && object.respond_to?(method)
+ end
+ end
+end