diff options
Diffstat (limited to 'lib/roo/excelx.rb')
| -rw-r--r-- | lib/roo/excelx.rb | 448 |
1 files changed, 448 insertions, 0 deletions
diff --git a/lib/roo/excelx.rb b/lib/roo/excelx.rb new file mode 100644 index 0000000..82c1431 --- /dev/null +++ b/lib/roo/excelx.rb @@ -0,0 +1,448 @@ +require 'nokogiri' +require 'zip/filesystem' +require 'roo/link' +require 'roo/tempdir' +require 'roo/utils' +require 'forwardable' +require 'set' + +module Roo + class Excelx < Roo::Base + extend Roo::Tempdir + extend Forwardable + + ERROR_VALUES = %w(#N/A #REF! #NAME? #DIV/0! #NULL! #VALUE! #NUM!).to_set + + require 'roo/excelx/shared' + require 'roo/excelx/workbook' + require 'roo/excelx/shared_strings' + require 'roo/excelx/styles' + require 'roo/excelx/cell' + require 'roo/excelx/sheet' + require 'roo/excelx/relationships' + require 'roo/excelx/comments' + require 'roo/excelx/sheet_doc' + require 'roo/excelx/coordinate' + require 'roo/excelx/format' + + delegate [:styles, :workbook, :shared_strings, :rels_files, :sheet_files, :comments_files] => :@shared + ExceedsMaxError = Class.new(StandardError) + + # initialization and opening of a spreadsheet file + # values for packed: :zip + # optional cell_max (int) parameter for early aborting attempts to parse + # enormous documents. + def initialize(filename_or_stream, options = {}) + packed = options[:packed] + file_warning = options.fetch(:file_warning, :error) + cell_max = options.delete(:cell_max) + sheet_options = {} + sheet_options[:expand_merged_ranges] = (options[:expand_merged_ranges] || false) + sheet_options[:no_hyperlinks] = (options[:no_hyperlinks] || false) + + unless is_stream?(filename_or_stream) + file_type_check(filename_or_stream, %w[.xlsx .xlsm], 'an Excel 2007', file_warning, packed) + basename = find_basename(filename_or_stream) + end + + # NOTE: Create temp directory and allow Ruby to cleanup the temp directory + # when the object is garbage collected. Initially, the finalizer was + # created in the Roo::Tempdir module, but that led to a segfault + # when testing in Ruby 2.4.0. + @tmpdir = self.class.make_tempdir(self, basename, options[:tmpdir_root]) + ObjectSpace.define_finalizer(self, self.class.finalize(object_id)) + + @shared = Shared.new(@tmpdir) + @filename = local_filename(filename_or_stream, @tmpdir, packed) + process_zipfile(@filename || filename_or_stream) + + @sheet_names = workbook.sheets.map do |sheet| + unless options[:only_visible_sheets] && sheet['state'] == 'hidden' + sheet['name'] + end + end.compact + @sheets = [] + @sheets_by_name = Hash[@sheet_names.map.with_index do |sheet_name, n| + @sheets[n] = Sheet.new(sheet_name, @shared, n, sheet_options) + [sheet_name, @sheets[n]] + end] + + if cell_max + cell_count = ::Roo::Utils.num_cells_in_range(sheet_for(options.delete(:sheet)).dimensions) + raise ExceedsMaxError.new("Excel file exceeds cell maximum: #{cell_count} > #{cell_max}") if cell_count > cell_max + end + + super + rescue + self.class.finalize_tempdirs(object_id) + raise + end + + def method_missing(method, *args) + if (label = workbook.defined_names[method.to_s]) + safe_send(sheet_for(label.sheet).cells[label.key], :value) + else + # call super for methods like #a1 + super + end + end + + def sheets + @sheet_names + end + + def sheet_for(sheet) + sheet ||= default_sheet + validate_sheet!(sheet) + @sheets_by_name[sheet] + end + + # Returns the content of a spreadsheet-cell. + # (1,1) is the upper left corner. + # (1,1), (1,'A'), ('A',1), ('a',1) all refers to the + # cell at the first line and first row. + def cell(row, col, sheet = nil) + key = normalize(row, col) + safe_send(sheet_for(sheet).cells[key], :value) + end + + def row(rownumber, sheet = nil) + sheet_for(sheet).row(rownumber) + end + + # returns all values in this column as an array + # column numbers are 1,2,3,... like in the spreadsheet + def column(column_number, sheet = nil) + if column_number.is_a?(::String) + column_number = ::Roo::Utils.letter_to_number(column_number) + end + sheet_for(sheet).column(column_number) + end + + # returns the number of the first non-empty row + def first_row(sheet = nil) + sheet_for(sheet).first_row + end + + # returns the number of the last non-empty row + def last_row(sheet = nil) + sheet_for(sheet).last_row + end + + # returns the number of the first non-empty column + def first_column(sheet = nil) + sheet_for(sheet).first_column + end + + # returns the number of the last non-empty column + def last_column(sheet = nil) + sheet_for(sheet).last_column + end + + # set a cell to a certain value + # (this will not be saved back to the spreadsheet file!) + def set(row, col, value, sheet = nil) #:nodoc: + key = normalize(row, col) + cell_type = cell_type_by_value(value) + sheet_for(sheet).cells[key] = Cell.new(value, cell_type, nil, cell_type, value, nil, nil, nil, Coordinate.new(row, col)) + end + + # Returns the formula at (row,col). + # Returns nil if there is no formula. + # The method #formula? checks if there is a formula. + def formula(row, col, sheet = nil) + key = normalize(row, col) + safe_send(sheet_for(sheet).cells[key], :formula) + end + + # Predicate methods really should return a boolean + # value. Hopefully no one was relying on the fact that this + # previously returned either nil/formula + def formula?(*args) + !!formula(*args) + end + + # returns each formula in the selected sheet as an array of tuples in following format + # [[row, col, formula], [row, col, formula],...] + def formulas(sheet = nil) + sheet_for(sheet).cells.select { |_, cell| cell.formula }.map do |(x, y), cell| + [x, y, cell.formula] + end + end + + # Given a cell, return the cell's style + def font(row, col, sheet = nil) + key = normalize(row, col) + definition_index = safe_send(sheet_for(sheet).cells[key], :style) + styles.definitions[definition_index] if definition_index + end + + # returns the type of a cell: + # * :float + # * :string, + # * :date + # * :percentage + # * :formula + # * :time + # * :datetime + def celltype(row, col, sheet = nil) + key = normalize(row, col) + safe_send(sheet_for(sheet).cells[key], :type) + end + + # returns the internal type of an excel cell + # * :numeric_or_formula + # * :string + # Note: this is only available within the Excelx class + def excelx_type(row, col, sheet = nil) + key = normalize(row, col) + safe_send(sheet_for(sheet).cells[key], :cell_type) + end + + # returns the internal value of an excelx cell + # Note: this is only available within the Excelx class + def excelx_value(row, col, sheet = nil) + key = normalize(row, col) + safe_send(sheet_for(sheet).cells[key], :cell_value) + end + + # returns the internal value of an excelx cell + # Note: this is only available within the Excelx class + def formatted_value(row, col, sheet = nil) + key = normalize(row, col) + safe_send(sheet_for(sheet).cells[key], :formatted_value) + end + + # returns the internal format of an excel cell + def excelx_format(row, col, sheet = nil) + key = normalize(row, col) + sheet_for(sheet).excelx_format(key) + end + + def empty?(row, col, sheet = nil) + sheet = sheet_for(sheet) + key = normalize(row, col) + cell = sheet.cells[key] + !cell || cell.empty? || + (row < sheet.first_row || row > sheet.last_row || col < sheet.first_column || col > sheet.last_column) + end + + # shows the internal representation of all cells + # for debugging purposes + def to_s(sheet = nil) + sheet_for(sheet).cells.inspect + end + + # returns the row,col values of the labelled cell + # (nil,nil) if label is not defined + def label(name) + labels = workbook.defined_names + return [nil, nil, nil] if labels.empty? || !labels.key?(name) + + [labels[name].row, labels[name].col, labels[name].sheet] + end + + # Returns an array which all labels. Each element is an array with + # [labelname, [row,col,sheetname]] + def labels + @labels ||= workbook.defined_names.map do |name, label| + [ + name, + [label.row, label.col, label.sheet] + ] + end + end + + def hyperlink?(row, col, sheet = nil) + !!hyperlink(row, col, sheet) + end + + # returns the hyperlink at (row/col) + # nil if there is no hyperlink + def hyperlink(row, col, sheet = nil) + key = normalize(row, col) + sheet_for(sheet).hyperlinks[key] + end + + # returns the comment at (row/col) + # nil if there is no comment + def comment(row, col, sheet = nil) + key = normalize(row, col) + sheet_for(sheet).comments[key] + end + + # true, if there is a comment + def comment?(row, col, sheet = nil) + !!comment(row, col, sheet) + end + + def comments(sheet = nil) + sheet_for(sheet).comments.map do |(x, y), comment| + [x, y, comment] + end + end + + # Yield an array of Excelx::Cell + # Takes options for sheet, pad_cells, and max_rows + def each_row_streaming(options = {}) + sheet = sheet_for(options.delete(:sheet)) + if block_given? + sheet.each_row(options) { |row| yield row } + else + sheet.to_enum(:each_row, options) + end + end + + private + + def clean_sheet(sheet) + @sheets_by_name[sheet].cells.each_pair do |coord, value| + next unless value.value.is_a?(::String) + + @sheets_by_name[sheet].cells[coord].value = sanitize_value(value.value) + end + + @cleaned[sheet] = true + end + + # Internal: extracts the worksheet_ids from the workbook.xml file. xlsx + # documents require a workbook.xml file, so a if the file is missing + # it is not a valid xlsx file. In these cases, an ArgumentError is + # raised. + # + # wb - a Zip::Entry for the workbook.xml file. + # path - A String for Zip::Entry's destination path. + # + # Examples + # + # extract_worksheet_ids(<Zip::Entry>, 'tmpdir/roo_workbook.xml') + # # => ["rId1", "rId2", "rId3"] + # + # Returns an Array of Strings. + def extract_worksheet_ids(entries, path) + wb = entries.find { |e| e.name[/workbook.xml$/] } + fail ArgumentError 'missing required workbook file' if wb.nil? + + wb.extract(path) + workbook_doc = Roo::Utils.load_xml(path).remove_namespaces! + workbook_doc.xpath('//sheet').map { |s| s.attributes['id'].value } + end + + # Internal + # + # wb_rels - A Zip::Entry for the workbook.xml.rels file. + # path - A String for the Zip::Entry's destination path. + # + # Examples + # + # extract_worksheets(<Zip::Entry>, 'tmpdir/roo_workbook.xml.rels') + # # => { + # "rId1"=>"worksheets/sheet1.xml", + # "rId2"=>"worksheets/sheet2.xml", + # "rId3"=>"worksheets/sheet3.xml" + # } + # + # Returns a Hash. + def extract_worksheet_rels(entries, path) + wb_rels = entries.find { |e| e.name[/workbook.xml.rels$/] } + fail ArgumentError 'missing required workbook file' if wb_rels.nil? + + wb_rels.extract(path) + rels_doc = Roo::Utils.load_xml(path).remove_namespaces! + worksheet_type = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet' + + relationships = rels_doc.xpath('//Relationship').select do |relationship| + relationship.attributes['Type'].value == worksheet_type + end + + relationships.inject({}) do |hash, relationship| + attributes = relationship.attributes + id = attributes['Id'] + hash[id.value] = attributes['Target'].value + hash + end + end + + # Extracts the sheets in order, but it will ignore sheets that are not + # worksheets. + def extract_sheets_in_order(entries, sheet_ids, sheets, tmpdir) + (sheet_ids & sheets.keys).each_with_index do |id, i| + name = sheets[id] + entry = entries.find { |e| "/#{e.name}" =~ /#{name}$/ } + path = "#{tmpdir}/roo_sheet#{i + 1}" + sheet_files << path + @sheet_files << path + entry.extract(path) + end + end + + # Extracts all needed files from the zip file + def process_zipfile(zipfilename_or_stream) + @sheet_files = [] + + unless is_stream?(zipfilename_or_stream) + zip_file = Zip::File.open(zipfilename_or_stream) + else + zip_file = Zip::CentralDirectory.new + zip_file.read_from_stream zipfilename_or_stream + end + + process_zipfile_entries zip_file.to_a.sort_by(&:name) + end + + def process_zipfile_entries(entries) + # NOTE: When Google or Numbers 3.1 exports to xlsx, the worksheet filenames + # are not in order. With Numbers 3.1, the first sheet is always + # sheet.xml, not sheet1.xml. With Google, the order of the worksheets is + # independent of a worksheet's filename (i.e. sheet6.xml can be the + # first worksheet). + # + # workbook.xml lists the correct order of worksheets and + # workbook.xml.rels lists the filenames for those worksheets. + # + # workbook.xml: + # <sheet state="visible" name="IS" sheetId="1" r:id="rId3"/> + # <sheet state="visible" name="BS" sheetId="2" r:id="rId4"/> + # workbook.xml.rel: + # <Relationship Id="rId4" Target="worksheets/sheet5.xml" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet"/> + # <Relationship Id="rId3" Target="worksheets/sheet4.xml" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet"/> + sheet_ids = extract_worksheet_ids(entries, "#{@tmpdir}/roo_workbook.xml") + sheets = extract_worksheet_rels(entries, "#{@tmpdir}/roo_workbook.xml.rels") + extract_sheets_in_order(entries, sheet_ids, sheets, @tmpdir) + + entries.each do |entry| + path = + case entry.name.downcase + when /sharedstrings.xml$/ + "#{@tmpdir}/roo_sharedStrings.xml" + when /styles.xml$/ + "#{@tmpdir}/roo_styles.xml" + when /comments([0-9]+).xml$/ + # FIXME: Most of the time, The order of the comment files are the same + # the sheet order, i.e. sheet1.xml's comments are in comments1.xml. + # In some situations, this isn't true. The true location of a + # sheet's comment file is in the sheet1.xml.rels file. SEE + # ECMA-376 12.3.3 in "Ecma Office Open XML Part 1". + nr = Regexp.last_match[1].to_i + comments_files[nr - 1] = "#{@tmpdir}/roo_comments#{nr}" + when %r{chartsheets/_rels/sheet([0-9]+).xml.rels$} + # NOTE: Chart sheet relationship files were interfering with + # worksheets. + nil + when /sheet([0-9]+).xml.rels$/ + # FIXME: Roo seems to use sheet[\d].xml.rels for hyperlinks only, but + # it also stores the location for sharedStrings, comments, + # drawings, etc. + nr = Regexp.last_match[1].to_i + rels_files[nr - 1] = "#{@tmpdir}/roo_rels#{nr}" + end + + entry.extract(path) if path + end + end + + def safe_send(object, method, *args) + object.send(method, *args) if object && object.respond_to?(method) + end + end +end |
