diff options
Diffstat (limited to 'lib/roo/excelx')
| -rw-r--r-- | lib/roo/excelx/cell.rb | 106 | ||||
| -rw-r--r-- | lib/roo/excelx/cell/base.rb | 94 | ||||
| -rw-r--r-- | lib/roo/excelx/cell/boolean.rb | 27 | ||||
| -rw-r--r-- | lib/roo/excelx/cell/date.rb | 28 | ||||
| -rw-r--r-- | lib/roo/excelx/cell/datetime.rb | 111 | ||||
| -rw-r--r-- | lib/roo/excelx/cell/empty.rb | 19 | ||||
| -rw-r--r-- | lib/roo/excelx/cell/number.rb | 87 | ||||
| -rw-r--r-- | lib/roo/excelx/cell/string.rb | 19 | ||||
| -rw-r--r-- | lib/roo/excelx/cell/time.rb | 43 | ||||
| -rw-r--r-- | lib/roo/excelx/comments.rb | 55 | ||||
| -rw-r--r-- | lib/roo/excelx/coordinate.rb | 12 | ||||
| -rw-r--r-- | lib/roo/excelx/extractor.rb | 21 | ||||
| -rw-r--r-- | lib/roo/excelx/format.rb | 64 | ||||
| -rw-r--r-- | lib/roo/excelx/relationships.rb | 25 | ||||
| -rw-r--r-- | lib/roo/excelx/shared.rb | 32 | ||||
| -rwxr-xr-x | lib/roo/excelx/shared_strings.rb | 157 | ||||
| -rw-r--r-- | lib/roo/excelx/sheet.rb | 112 | ||||
| -rwxr-xr-x | lib/roo/excelx/sheet_doc.rb | 211 | ||||
| -rw-r--r-- | lib/roo/excelx/styles.rb | 64 | ||||
| -rw-r--r-- | lib/roo/excelx/workbook.rb | 59 |
20 files changed, 1346 insertions, 0 deletions
diff --git a/lib/roo/excelx/cell.rb b/lib/roo/excelx/cell.rb new file mode 100644 index 0000000..2fc78e1 --- /dev/null +++ b/lib/roo/excelx/cell.rb @@ -0,0 +1,106 @@ +require 'date' +require 'roo/excelx/cell/base' +require 'roo/excelx/cell/boolean' +require 'roo/excelx/cell/datetime' +require 'roo/excelx/cell/date' +require 'roo/excelx/cell/empty' +require 'roo/excelx/cell/number' +require 'roo/excelx/cell/string' +require 'roo/excelx/cell/time' + +module Roo + class Excelx + class Cell + attr_reader :formula, :value, :excelx_type, :excelx_value, :style, :hyperlink, :coordinate + attr_writer :value + + # DEPRECATED: Please use Cell.create_cell instead. + def initialize(value, type, formula, excelx_type, excelx_value, style, hyperlink, base_date, coordinate) + warn '[DEPRECATION] `Cell.new` is deprecated. Please use `Cell.create_cell` instead.' + @type = type + @formula = formula + @base_date = base_date if [:date, :datetime].include?(@type) + @excelx_type = excelx_type + @excelx_value = excelx_value + @style = style + @value = type_cast_value(value) + @value = Roo::Link.new(hyperlink, @value.to_s) if hyperlink + @coordinate = coordinate + end + + def type + case + when @formula + :formula + when @value.is_a?(Roo::Link) + :link + else + @type + end + end + + def self.create_cell(type, *values) + case type + when :string + Cell::String.new(*values) + when :boolean + Cell::Boolean.new(*values) + when :number + Cell::Number.new(*values) + when :date + Cell::Date.new(*values) + when :datetime + Cell::DateTime.new(*values) + when :time + Cell::Time.new(*values) + end + end + + # Deprecated: use Roo::Excelx::Coordinate instead. + class Coordinate + attr_accessor :row, :column + + def initialize(row, column) + warn '[DEPRECATION] `Roo::Excel::Cell::Coordinate` is deprecated. Please use `Roo::Excelx::Coordinate` instead.' + @row, @column = row, column + end + end + + private + + def type_cast_value(value) + case @type + when :float, :percentage + value.to_f + when :date + create_date(@base_date + value.to_i) + when :datetime + create_datetime(@base_date + value.to_f.round(6)) + when :time + value.to_f * 86_400 + else + value + end + end + + def create_date(date) + yyyy, mm, dd = date.strftime('%Y-%m-%d').split('-') + + ::Date.new(yyyy.to_i, mm.to_i, dd.to_i) + end + + def create_datetime(date) + datetime_string = date.strftime('%Y-%m-%d %H:%M:%S.%N') + t = round_datetime(datetime_string) + + ::DateTime.civil(t.year, t.month, t.day, t.hour, t.min, t.sec) + end + + def round_datetime(datetime_string) + /(?<yyyy>\d+)-(?<mm>\d+)-(?<dd>\d+) (?<hh>\d+):(?<mi>\d+):(?<ss>\d+.\d+)/ =~ datetime_string + + ::Time.new(yyyy.to_i, mm.to_i, dd.to_i, hh.to_i, mi.to_i, ss.to_r).round(0) + end + end + end +end diff --git a/lib/roo/excelx/cell/base.rb b/lib/roo/excelx/cell/base.rb new file mode 100644 index 0000000..aea8808 --- /dev/null +++ b/lib/roo/excelx/cell/base.rb @@ -0,0 +1,94 @@ +module Roo + class Excelx + class Cell + class Base + attr_reader :cell_type, :cell_value, :value + + # FIXME: I think style should be deprecated. Having a style attribute + # for a cell doesn't really accomplish much. It seems to be used + # when you want to export to excelx. + attr_reader :style + + + # FIXME: Updating a cell's value should be able tochange the cell's type, + # but that isn't currently possible. This will cause weird bugs + # when one changes the value of a Number cell to a String. e.g. + # + # cell = Cell::Number(*args) + # cell.value = 'Hello' + # cell.formatted_value # => Some unexpected value + # + # Here are two possible solutions to such issues: + # 1. Don't allow a cell's value to be updated. Use a method like + # `Sheet.update_cell` instead. The simple solution. + # 2. When `cell.value = ` is called, use injection to try and + # change the type of cell on the fly. But deciding what type + # of value to pass to `cell.value=`. isn't always obvious. e.g. + # `cell.value = Time.now` should convert a cell to a DateTime, + # not a Time cell. Time cells would be hard to recognize because + # they are integers. This approach would require a significant + # change to the code as written. The complex solution. + # + # If the first solution is used, then this method should be + # deprecated. + attr_writer :value + + def initialize(value, formula, excelx_type, style, link, coordinate) + @link = !!link + @cell_value = value + @cell_type = excelx_type + @formula = formula + @style = style + @coordinate = coordinate + @type = :base + @value = link? ? Roo::Link.new(link, value) : value + end + + def type + if formula? + :formula + elsif link? + :link + else + @type + end + end + + def formula? + !!@formula + end + + def link? + !!@link + end + + alias_method :formatted_value, :value + + def to_s + formatted_value + end + + # DEPRECATED: Please use link instead. + def hyperlink + warn '[DEPRECATION] `hyperlink` is deprecated. Please use `link` instead.' + end + + # DEPRECATED: Please use cell_value instead. + def excelx_value + warn '[DEPRECATION] `excelx_value` is deprecated. Please use `cell_value` instead.' + cell_value + end + + # DEPRECATED: Please use cell_type instead. + def excelx_type + warn '[DEPRECATION] `excelx_type` is deprecated. Please use `cell_type` instead.' + cell_type + end + + def empty? + false + end + end + end + end +end diff --git a/lib/roo/excelx/cell/boolean.rb b/lib/roo/excelx/cell/boolean.rb new file mode 100644 index 0000000..fe1f691 --- /dev/null +++ b/lib/roo/excelx/cell/boolean.rb @@ -0,0 +1,27 @@ +module Roo + class Excelx + class Cell + class Boolean < Cell::Base + attr_reader :value, :formula, :format, :cell_type, :cell_value, :link, :coordinate + + def initialize(value, formula, style, link, coordinate) + super(value, formula, nil, style, link, coordinate) + @type = @cell_type = :boolean + @value = link? ? Roo::Link.new(link, value) : create_boolean(value) + end + + def formatted_value + value ? 'TRUE'.freeze : 'FALSE'.freeze + end + + private + + def create_boolean(value) + # FIXME: Using a boolean will cause methods like Base#to_csv to fail. + # Roo is using some method to ignore false/nil values. + value.to_i == 1 ? true : false + end + end + end + end +end diff --git a/lib/roo/excelx/cell/date.rb b/lib/roo/excelx/cell/date.rb new file mode 100644 index 0000000..8e2c6cb --- /dev/null +++ b/lib/roo/excelx/cell/date.rb @@ -0,0 +1,28 @@ +require 'date' + +module Roo + class Excelx + class Cell + class Date < Roo::Excelx::Cell::DateTime + attr_reader :value, :formula, :format, :cell_type, :cell_value, :link, :coordinate + + def initialize(value, formula, excelx_type, style, link, base_date, coordinate) + # NOTE: Pass all arguments to the parent class, DateTime. + super + @type = :date + @format = excelx_type.last + @value = link? ? Roo::Link.new(link, value) : create_date(base_date, value) + end + + private + + def create_date(base_date, value) + date = base_date + value.to_i + yyyy, mm, dd = date.strftime('%Y-%m-%d').split('-') + + ::Date.new(yyyy.to_i, mm.to_i, dd.to_i) + end + end + end + end +end diff --git a/lib/roo/excelx/cell/datetime.rb b/lib/roo/excelx/cell/datetime.rb new file mode 100644 index 0000000..35d93ac --- /dev/null +++ b/lib/roo/excelx/cell/datetime.rb @@ -0,0 +1,111 @@ +require 'date' + +module Roo + class Excelx + class Cell + class DateTime < Cell::Base + attr_reader :value, :formula, :format, :cell_value, :link, :coordinate + + def initialize(value, formula, excelx_type, style, link, base_date, coordinate) + super(value, formula, excelx_type, style, link, coordinate) + @type = :datetime + @format = excelx_type.last + @value = link? ? Roo::Link.new(link, value) : create_datetime(base_date, value) + end + + # Public: Returns formatted value for a datetime. Format's can be an + # standard excel format, or a custom format. + # + # Standard formats follow certain conventions. Date fields for + # days, months, and years are separated with hyhens or + # slashes ("-", /") (e.g. 01-JAN, 1/13/15). Time fields for + # hours, minutes, and seconds are separated with a colon (e.g. + # 12:45:01). + # + # If a custom format follows those conventions, then the custom + # format will be used for the a cell's formatted value. + # Otherwise, the formatted value will be in the following + # format: 'YYYY-mm-dd HH:MM:SS' (e.g. "2015-07-10 20:33:15"). + # + # Examples + # formatted_value #=> '01-JAN' + # + # Returns a String representation of a cell's value. + def formatted_value + formatter = @format.downcase.split(' ').map do |part| + if (parsed_format = parse_date_or_time_format(part)) + parsed_format + else + warn 'Unable to parse custom format. Using "YYYY-mm-dd HH:MM:SS" format.' + return @value.strftime('%F %T') + end + end.join(' ') + + @value.strftime(formatter) + end + + private + + def parse_date_or_time_format(part) + date_regex = /(?<date>[dmy]+[\-\/][dmy]+([\-\/][dmy]+)?)/ + time_regex = /(?<time>(\[?[h]\]?+:)?[m]+(:?ss|:?s)?)/ + + if part[date_regex] == part + formats = DATE_FORMATS + elsif part[time_regex] + formats = TIME_FORMATS + else + return false + end + + part.gsub(/#{formats.keys.join('|')}/, formats) + end + + DATE_FORMATS = { + 'yyyy' => '%Y', # Year: 2000 + 'yy' => '%y', # Year: 00 + # mmmmm => J-D + 'mmmm' => '%B', # Month: January + 'mmm' => '%^b', # Month: JAN + 'mm' => '%m', # Month: 01 + 'm' => '%-m', # Month: 1 + 'dddd' => '%A', # Day of the Week: Sunday + 'ddd' => '%^a', # Day of the Week: SUN + 'dd' => '%d', # Day of the Month: 01 + 'd' => '%-d' # Day of the Month: 1 + # '\\\\'.freeze => ''.freeze, # NOTE: Fixes a custom format's output. + } + + TIME_FORMATS = { + 'hh' => '%H', # Hour (24): 01 + 'h' => '%-k'.freeze, # Hour (24): 1 + # 'hh'.freeze => '%I'.freeze, # Hour (12): 08 + # 'h'.freeze => '%-l'.freeze, # Hour (12): 8 + 'mm' => '%M', # Minute: 01 + # FIXME: is this used? Seems like 'm' is used for month, not minute. + 'm' => '%-M', # Minute: 1 + 'ss' => '%S', # Seconds: 01 + 's' => '%-S', # Seconds: 1 + 'am/pm' => '%p', # Meridian: AM + '000' => '%3N', # Fractional Seconds: thousandth. + '00' => '%2N', # Fractional Seconds: hundredth. + '0' => '%1N' # Fractional Seconds: tenths. + } + + def create_datetime(base_date, value) + date = base_date + value.to_f.round(6) + datetime_string = date.strftime('%Y-%m-%d %H:%M:%S.%N') + t = round_datetime(datetime_string) + + ::DateTime.civil(t.year, t.month, t.day, t.hour, t.min, t.sec) + end + + def round_datetime(datetime_string) + /(?<yyyy>\d+)-(?<mm>\d+)-(?<dd>\d+) (?<hh>\d+):(?<mi>\d+):(?<ss>\d+.\d+)/ =~ datetime_string + + ::Time.new(yyyy, mm, dd, hh, mi, ss.to_r).round(0) + end + end + end + end +end diff --git a/lib/roo/excelx/cell/empty.rb b/lib/roo/excelx/cell/empty.rb new file mode 100644 index 0000000..49a20e7 --- /dev/null +++ b/lib/roo/excelx/cell/empty.rb @@ -0,0 +1,19 @@ + +module Roo + class Excelx + class Cell + class Empty < Cell::Base + attr_reader :value, :formula, :format, :cell_type, :cell_value, :hyperlink, :coordinate + + def initialize(coordinate) + @value = @formula = @format = @cell_type = @cell_value = @hyperlink = nil + @coordinate = coordinate + end + + def empty? + true + end + end + end + end +end diff --git a/lib/roo/excelx/cell/number.rb b/lib/roo/excelx/cell/number.rb new file mode 100644 index 0000000..2015562 --- /dev/null +++ b/lib/roo/excelx/cell/number.rb @@ -0,0 +1,87 @@ +module Roo + class Excelx + class Cell + class Number < Cell::Base + attr_reader :value, :formula, :format, :cell_value, :link, :coordinate + + def initialize(value, formula, excelx_type, style, link, coordinate) + super + # FIXME: change @type to number. This will break brittle tests. + # FIXME: Excelx_type is an array, but the first value isn't used. + @type = :float + @format = excelx_type.last + @value = link? ? Roo::Link.new(link, value) : create_numeric(value) + end + + def create_numeric(number) + return number if Excelx::ERROR_VALUES.include?(number) + case @format + when /%/ + Float(number) + when /\.0/ + Float(number) + else + (number.include?('.') || (/\A[-+]?\d+E[-+]\d+\z/i =~ number)) ? Float(number) : Integer(number) + end + end + + def formatted_value + return @cell_value if Excelx::ERROR_VALUES.include?(@cell_value) + + formatter = formats[@format] + if formatter.is_a? Proc + formatter.call(@cell_value) + elsif zero_padded_number? + "%0#{@format.size}d" % @cell_value + else + Kernel.format(formatter, @cell_value) + end + end + + def formats + # FIXME: numbers can be other colors besides red: + # [BLACK], [BLUE], [CYAN], [GREEN], [MAGENTA], [RED], [WHITE], [YELLOW], [COLOR n] + { + 'General' => '%.0f', + '0' => '%.0f', + '0.00' => '%.2f', + '0.000000' => '%.6f', + '#,##0' => number_format('%.0f'), + '#,##0.00' => number_format('%.2f'), + '0%' => proc do |number| + Kernel.format('%d%', number.to_f * 100) + end, + '0.00%' => proc do |number| + Kernel.format('%.2f%', number.to_f * 100) + end, + '0.00E+00' => '%.2E', + '#,##0 ;(#,##0)' => number_format('%.0f', '(%.0f)'), + '#,##0 ;[Red](#,##0)' => number_format('%.0f', '[Red](%.0f)'), + '#,##0.00;(#,##0.00)' => number_format('%.2f', '(%.2f)'), + '#,##0.00;[Red](#,##0.00)' => number_format('%.2f', '[Red](%.2f)'), + # FIXME: not quite sure what the format should look like in this case. + '##0.0E+0' => '%.1E', + '@' => proc { |number| number } + } + end + + private + + def number_format(formatter, negative_formatter = nil) + proc do |number| + if negative_formatter + formatter = number.to_i > 0 ? formatter : negative_formatter + number = number.to_f.abs + end + + Kernel.format(formatter, number).reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse + end + end + + def zero_padded_number? + @format[/0+/] == @format + end + end + end + end +end diff --git a/lib/roo/excelx/cell/string.rb b/lib/roo/excelx/cell/string.rb new file mode 100644 index 0000000..7967806 --- /dev/null +++ b/lib/roo/excelx/cell/string.rb @@ -0,0 +1,19 @@ +module Roo + class Excelx + class Cell + class String < Cell::Base + attr_reader :value, :formula, :format, :cell_type, :cell_value, :link, :coordinate + + def initialize(value, formula, style, link, coordinate) + super(value, formula, nil, style, link, coordinate) + @type = @cell_type = :string + @value = link? ? Roo::Link.new(link, value) : value + end + + def empty? + value.empty? + end + end + end + end +end diff --git a/lib/roo/excelx/cell/time.rb b/lib/roo/excelx/cell/time.rb new file mode 100644 index 0000000..d661ab8 --- /dev/null +++ b/lib/roo/excelx/cell/time.rb @@ -0,0 +1,43 @@ +require 'date' + +module Roo + class Excelx + class Cell + class Time < Roo::Excelx::Cell::DateTime + attr_reader :value, :formula, :format, :cell_value, :link, :coordinate + + def initialize(value, formula, excelx_type, style, link, base_date, coordinate) + # NOTE: Pass all arguments to DateTime super class. + super + @type = :time + @format = excelx_type.last + @datetime = create_datetime(base_date, value) + @value = link? ? Roo::Link.new(link, value) : (value.to_f * 86_400).to_i + end + + def formatted_value + formatter = @format.gsub(/#{TIME_FORMATS.keys.join('|')}/, TIME_FORMATS) + @datetime.strftime(formatter) + end + + alias_method :to_s, :formatted_value + + private + + # def create_datetime(base_date, value) + # date = base_date + value.to_f.round(6) + # datetime_string = date.strftime('%Y-%m-%d %H:%M:%S.%N') + # t = round_datetime(datetime_string) + # + # ::DateTime.civil(t.year, t.month, t.day, t.hour, t.min, t.sec) + # end + + # def round_datetime(datetime_string) + # /(?<yyyy>\d+)-(?<mm>\d+)-(?<dd>\d+) (?<hh>\d+):(?<mi>\d+):(?<ss>\d+.\d+)/ =~ datetime_string + # + # ::Time.new(yyyy.to_i, mm.to_i, dd.to_i, hh.to_i, mi.to_i, ss.to_r).round(0) + # end + end + end + end +end diff --git a/lib/roo/excelx/comments.rb b/lib/roo/excelx/comments.rb new file mode 100644 index 0000000..1a89908 --- /dev/null +++ b/lib/roo/excelx/comments.rb @@ -0,0 +1,55 @@ +require 'roo/excelx/extractor' + +module Roo + class Excelx + class Comments < Excelx::Extractor + def comments + @comments ||= extract_comments + end + + private + + def extract_comments + return {} unless doc_exists? + + Hash[doc.xpath('//comments/commentList/comment').map do |comment| + value = (comment.at_xpath('./text/r/t') || comment.at_xpath('./text/t')).text + [::Roo::Utils.ref_to_key(comment.attributes['ref'].to_s), value] + end] + end + end + end +end +# xl/comments1.xml +# <?xml version="1.0" encoding="UTF-8" standalone="yes" ?> +# <comments xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main"> +# <authors> +# <author /> +# </authors> +# <commentList> +# <comment ref="B4" authorId="0"> +# <text> +# <r> +# <rPr> +# <sz val="10" /> +# <rFont val="Arial" /> +# <family val="2" /> +# </rPr> +# <t>Comment for B4</t> +# </r> +# </text> +# </comment> +# <comment ref="B5" authorId="0"> +# <text> +# <r> +# <rPr> +# <sz val="10" /> +# <rFont val="Arial" /> +# <family val="2" /> +# </rPr> +# <t>Comment for B5</t> +# </r> +# </text> +# </comment> +# </commentList> +# </comments> diff --git a/lib/roo/excelx/coordinate.rb b/lib/roo/excelx/coordinate.rb new file mode 100644 index 0000000..53b24ba --- /dev/null +++ b/lib/roo/excelx/coordinate.rb @@ -0,0 +1,12 @@ +module Roo + class Excelx + class Coordinate + attr_accessor :row, :column + + def initialize(row, column) + @row = row + @column = column + end + end + end +end diff --git a/lib/roo/excelx/extractor.rb b/lib/roo/excelx/extractor.rb new file mode 100644 index 0000000..1cdd13b --- /dev/null +++ b/lib/roo/excelx/extractor.rb @@ -0,0 +1,21 @@ +module Roo + class Excelx + class Extractor + def initialize(path) + @path = path + end + + private + + def doc + raise FileNotFound, "#{@path} file not found" unless doc_exists? + + ::Roo::Utils.load_xml(@path).remove_namespaces! + end + + def doc_exists? + @path && File.exist?(@path) + end + end + end +end diff --git a/lib/roo/excelx/format.rb b/lib/roo/excelx/format.rb new file mode 100644 index 0000000..72b36d9 --- /dev/null +++ b/lib/roo/excelx/format.rb @@ -0,0 +1,64 @@ +module Roo + class Excelx + module Format + EXCEPTIONAL_FORMATS = { + 'h:mm am/pm' => :date, + 'h:mm:ss am/pm' => :date + } + + STANDARD_FORMATS = { + 0 => 'General'.freeze, + 1 => '0'.freeze, + 2 => '0.00'.freeze, + 3 => '#,##0'.freeze, + 4 => '#,##0.00'.freeze, + 9 => '0%'.freeze, + 10 => '0.00%'.freeze, + 11 => '0.00E+00'.freeze, + 12 => '# ?/?'.freeze, + 13 => '# ??/??'.freeze, + 14 => 'mm-dd-yy'.freeze, + 15 => 'd-mmm-yy'.freeze, + 16 => 'd-mmm'.freeze, + 17 => 'mmm-yy'.freeze, + 18 => 'h:mm AM/PM'.freeze, + 19 => 'h:mm:ss AM/PM'.freeze, + 20 => 'h:mm'.freeze, + 21 => 'h:mm:ss'.freeze, + 22 => 'm/d/yy h:mm'.freeze, + 37 => '#,##0 ;(#,##0)'.freeze, + 38 => '#,##0 ;[Red](#,##0)'.freeze, + 39 => '#,##0.00;(#,##0.00)'.freeze, + 40 => '#,##0.00;[Red](#,##0.00)'.freeze, + 45 => 'mm:ss'.freeze, + 46 => '[h]:mm:ss'.freeze, + 47 => 'mmss.0'.freeze, + 48 => '##0.0E+0'.freeze, + 49 => '@'.freeze + } + + def to_type(format) + format = format.to_s.downcase + if (type = EXCEPTIONAL_FORMATS[format]) + type + elsif format.include?('#') + :float + elsif !format.match(/d+(?![\]])/).nil? || format.include?('y') + if format.include?('h') || format.include?('s') + :datetime + else + :date + end + elsif format.include?('h') || format.include?('s') + :time + elsif format.include?('%') + :percentage + else + :float + end + end + + module_function :to_type + end + end +end diff --git a/lib/roo/excelx/relationships.rb b/lib/roo/excelx/relationships.rb new file mode 100644 index 0000000..8a0ed97 --- /dev/null +++ b/lib/roo/excelx/relationships.rb @@ -0,0 +1,25 @@ +require 'roo/excelx/extractor' + +module Roo + class Excelx + class Relationships < Excelx::Extractor + def [](index) + to_a[index] + end + + def to_a + @relationships ||= extract_relationships + end + + private + + def extract_relationships + return [] unless doc_exists? + + Hash[doc.xpath('/Relationships/Relationship').map do |rel| + [rel.attribute('Id').text, rel] + end] + end + end + end +end diff --git a/lib/roo/excelx/shared.rb b/lib/roo/excelx/shared.rb new file mode 100644 index 0000000..3677fa2 --- /dev/null +++ b/lib/roo/excelx/shared.rb @@ -0,0 +1,32 @@ +module Roo + class Excelx + # Public: Shared class for allowing sheets to share data. This should + # reduce memory usage and reduce the number of objects being passed + # to various inititializers. + class Shared + attr_accessor :comments_files, :sheet_files, :rels_files + def initialize(dir) + @dir = dir + @comments_files = [] + @sheet_files = [] + @rels_files = [] + end + + def styles + @styles ||= Styles.new(File.join(@dir, 'roo_styles.xml')) + end + + def shared_strings + @shared_strings ||= SharedStrings.new(File.join(@dir, 'roo_sharedStrings.xml')) + end + + def workbook + @workbook ||= Workbook.new(File.join(@dir, 'roo_workbook.xml')) + end + + def base_date + workbook.base_date + end + end + end +end diff --git a/lib/roo/excelx/shared_strings.rb b/lib/roo/excelx/shared_strings.rb new file mode 100755 index 0000000..f7caf7c --- /dev/null +++ b/lib/roo/excelx/shared_strings.rb @@ -0,0 +1,157 @@ +require 'roo/excelx/extractor' + +module Roo + class Excelx + class SharedStrings < Excelx::Extractor + + COMMON_STRINGS = { + t: "t", + r: "r", + html_tag_open: "<html>", + html_tag_closed: "</html>" + } + + def [](index) + to_a[index] + end + + def to_a + @array ||= extract_shared_strings + end + + def to_html + @html ||= extract_html + end + + # Use to_html or to_a for html returns + # See what is happening with commit??? + def use_html?(index) + to_html[index][/<([biu]|sup|sub)>/] + end + + private + + def fix_invalid_shared_strings(doc) + invalid = { '_x000D_' => "\n" } + xml = doc.to_s + return doc unless xml[/#{invalid.keys.join('|')}/] + + ::Nokogiri::XML(xml.gsub(/#{invalid.keys.join('|')}/, invalid)) + end + + def extract_shared_strings + return [] unless doc_exists? + + document = fix_invalid_shared_strings(doc) + # read the shared strings xml document + document.xpath('/sst/si').map do |si| + shared_string = '' + si.children.each do |elem| + case elem.name + when 'r' + elem.children.each do |r_elem| + shared_string << r_elem.content if r_elem.name == 't' + end + when 't' + shared_string = elem.content + end + end + shared_string + end + end + + def extract_html + return [] unless doc_exists? + fix_invalid_shared_strings(doc) + # read the shared strings xml document + doc.xpath('/sst/si').map do |si| + html_string = '<html>' + si.children.each do |elem| + case elem.name + when 'r' + html_string << extract_html_r(elem) + when 't' + html_string << elem.content + end # case elem.name + end # si.children.each do |elem| + html_string << '</html>' + end # doc.xpath('/sst/si').map do |si| + end # def extract_html + + # The goal of this function is to take the following XML code snippet and create a html tag + # r_elem ::: XML Element that is in sharedStrings.xml of excel_book.xlsx + # {code:xml} + # <r> + # <rPr> + # <i/> + # <b/> + # <u/> + # <vertAlign val="subscript"/> + # <vertAlign val="superscript"/> + # </rPr> + # <t>TEXT</t> + # </r> + # {code} + # + # Expected Output ::: "<html><sub|sup><b><i><u>TEXT</u></i></b></sub|/sup></html>" + def extract_html_r(r_elem) + str = '' + xml_elems = { + sub: false, + sup: false, + b: false, + i: false, + u: false + } + b, i, u, sub, sup = false, false, false, false, false + r_elem.children.each do |elem| + case elem.name + when 'rPr' + elem.children.each do |rPr_elem| + case rPr_elem.name + when 'b' + # set formatting for Bold to true + xml_elems[:b] = true + when 'i' + # set formatting for Italics to true + xml_elems[:i] = true + when 'u' + # set formatting for Underline to true + xml_elems[:u] = true + when 'vertAlign' + # See if the Vertical Alignment is subscript or superscript + case rPr_elem.xpath('@val').first.value + when 'subscript' + # set formatting for Subscript to true and Superscript to false ... Can't have both + xml_elems[:sub] = true + xml_elems[:sup] = false + when 'superscript' + # set formatting for Superscript to true and Subscript to false ... Can't have both + xml_elems[:sup] = true + xml_elems[:sub] = false + end + end + end + when 't' + str << create_html(elem.content, xml_elems) + end + end + str + end # extract_html_r + + # This will return an html string + def create_html(text, formatting) + tmp_str = '' + formatting.each do |elem, val| + tmp_str << "<#{elem}>" if val + end + tmp_str << text + reverse_format = Hash[formatting.to_a.reverse] + reverse_format.each do |elem, val| + tmp_str << "</#{elem}>" if val + end + tmp_str + end + end # class SharedStrings < Excelx::Extractor + end # class Excelx +end # module Roo diff --git a/lib/roo/excelx/sheet.rb b/lib/roo/excelx/sheet.rb new file mode 100644 index 0000000..add92f0 --- /dev/null +++ b/lib/roo/excelx/sheet.rb @@ -0,0 +1,112 @@ +require 'forwardable' +module Roo + class Excelx + class Sheet + extend Forwardable + + delegate [:styles, :workbook, :shared_strings, :rels_files, :sheet_files, :comments_files] => :@shared + + def initialize(name, shared, sheet_index, options = {}) + @name = name + @shared = shared + @rels = Relationships.new(rels_files[sheet_index]) + @comments = Comments.new(comments_files[sheet_index]) + @sheet = SheetDoc.new(sheet_files[sheet_index], @rels, shared, options) + end + + def cells + @cells ||= @sheet.cells(@rels) + end + + def present_cells + @present_cells ||= cells.select { |_, cell| cell && !cell.empty? } + end + + # Yield each row as array of Excelx::Cell objects + # accepts options max_rows (int) (offset by 1 for header), + # pad_cells (boolean) and offset (int) + def each_row(options = {}, &block) + row_count = 0 + options[:offset] ||= 0 + @sheet.each_row_streaming do |row| + break if options[:max_rows] && row_count == options[:max_rows] + options[:offset] + 1 + if block_given? && !(options[:offset] && row_count < options[:offset]) + block.call(cells_for_row_element(row, options)) + end + row_count += 1 + end + end + + def row(row_number) + first_column.upto(last_column).map do |col| + cells[[row_number, col]] + end.map { |cell| cell && cell.value } + end + + def column(col_number) + first_row.upto(last_row).map do |row| + cells[[row, col_number]] + end.map { |cell| cell && cell.value } + end + + # returns the number of the first non-empty row + def first_row + @first_row ||= present_cells.keys.map { |row, _| row }.min + end + + def last_row + @last_row ||= present_cells.keys.map { |row, _| row }.max + end + + # returns the number of the first non-empty column + def first_column + @first_column ||= present_cells.keys.map { |_, col| col }.min + end + + # returns the number of the last non-empty column + def last_column + @last_column ||= present_cells.keys.map { |_, col| col }.max + end + + def excelx_format(key) + cell = cells[key] + styles.style_format(cell.style).to_s if cell + end + + def hyperlinks + @hyperlinks ||= @sheet.hyperlinks(@rels) + end + + def comments + @comments.comments + end + + def dimensions + @sheet.dimensions + end + + private + + # Take an xml row and return an array of Excelx::Cell objects + # optionally pad array to header width(assumed 1st row). + # takes option pad_cells (boolean) defaults false + def cells_for_row_element(row_element, options = {}) + return [] unless row_element + cell_col = 0 + cells = [] + @sheet.each_cell(row_element) do |cell| + cells.concat(pad_cells(cell, cell_col)) if options[:pad_cells] + cells << cell + cell_col = cell.coordinate.column + end + cells + end + + def pad_cells(cell, last_column) + pad = [] + (cell.coordinate.column - 1 - last_column).times { pad << nil } + pad + end + end + end +end diff --git a/lib/roo/excelx/sheet_doc.rb b/lib/roo/excelx/sheet_doc.rb new file mode 100755 index 0000000..a705958 --- /dev/null +++ b/lib/roo/excelx/sheet_doc.rb @@ -0,0 +1,211 @@ +require 'forwardable' +require 'roo/excelx/extractor' + +module Roo + class Excelx + class SheetDoc < Excelx::Extractor + extend Forwardable + delegate [:styles, :workbook, :shared_strings, :base_date] => :@shared + + def initialize(path, relationships, shared, options = {}) + super(path) + @shared = shared + @options = options + @relationships = relationships + end + + def cells(relationships) + @cells ||= extract_cells(relationships) + end + + def hyperlinks(relationships) + @hyperlinks ||= extract_hyperlinks(relationships) + end + + # Get the dimensions for the sheet. + # This is the upper bound of cells that might + # be parsed. (the document may be sparse so cell count is only upper bound) + def dimensions + @dimensions ||= extract_dimensions + end + + # Yield each row xml element to caller + def each_row_streaming(&block) + Roo::Utils.each_element(@path, 'row', &block) + end + + # Yield each cell as Excelx::Cell to caller for given + # row xml + def each_cell(row_xml) + return [] unless row_xml + row_xml.children.each do |cell_element| + # If you're sure you're not going to need this hyperlinks you can discard it + hyperlinks = unless @options[:no_hyperlinks] + key = ::Roo::Utils.ref_to_key(cell_element['r']) + hyperlinks(@relationships)[key] + end + + yield cell_from_xml(cell_element, hyperlinks) + end + end + + private + + def cell_value_type(type, format) + case type + when 's'.freeze + :shared + when 'b'.freeze + :boolean + when 'str'.freeze + :string + when 'inlineStr'.freeze + :inlinestr + else + Excelx::Format.to_type(format) + end + end + + # Internal: Creates a cell based on an XML clell.. + # + # cell_xml - a Nokogiri::XML::Element. e.g. + # <c r="A5" s="2"> + # <v>22606</v> + # </c> + # hyperlink - a String for the hyperlink for the cell or nil when no + # hyperlink is present. + # + # Examples + # + # cells_from_xml(<Nokogiri::XML::Element>, nil) + # # => <Excelx::Cell::String> + # + # Returns a type of <Excelx::Cell>. + def cell_from_xml(cell_xml, hyperlink) + coordinate = extract_coordinate(cell_xml['r']) + return Excelx::Cell::Empty.new(coordinate) if cell_xml.children.empty? + + # NOTE: This is error prone, to_i will silently turn a nil into a 0. + # This works by coincidence because Format[0] is General. + style = cell_xml['s'].to_i + format = styles.style_format(style) + value_type = cell_value_type(cell_xml['t'], format) + formula = nil + + cell_xml.children.each do |cell| + case cell.name + when 'is' + content_arr = cell.search('t').map(&:content) + unless content_arr.empty? + return Excelx::Cell.create_cell(:string, content_arr.join(''), formula, style, hyperlink, coordinate) + end + when 'f' + formula = cell.content + when 'v' + return create_cell_from_value(value_type, cell, formula, format, style, hyperlink, base_date, coordinate) + end + end + + Excelx::Cell::Empty.new(coordinate) + end + + def create_cell_from_value(value_type, cell, formula, format, style, hyperlink, base_date, coordinate) + # NOTE: format.to_s can replace excelx_type as an argument for + # Cell::Time, Cell::DateTime, Cell::Date or Cell::Number, but + # it will break some brittle tests. + excelx_type = [:numeric_or_formula, format.to_s] + + # NOTE: There are only a few situations where value != cell.content + # 1. when a sharedString is used. value = sharedString; + # cell.content = id of sharedString + # 2. boolean cells: value = 'TRUE' | 'FALSE'; cell.content = '0' | '1'; + # But a boolean cell should use TRUE|FALSE as the formatted value + # and use a Boolean for it's value. Using a Boolean value breaks + # Roo::Base#to_csv. + # 3. formula + case value_type + when :shared + value = shared_strings.use_html?(cell.content.to_i) ? shared_strings.to_html[cell.content.to_i] : shared_strings[cell.content.to_i] + Excelx::Cell.create_cell(:string, value, formula, style, hyperlink, coordinate) + when :boolean, :string + value = cell.content + Excelx::Cell.create_cell(value_type, value, formula, style, hyperlink, coordinate) + when :time, :datetime + cell_content = cell.content.to_f + # NOTE: A date will be a whole number. A time will have be > 1. And + # in general, a datetime will have decimals. But if the cell is + # using a custom format, it's possible to be interpreted incorrectly. + # cell_content.to_i == cell_content && standard_style?=> :date + # + # Should check to see if the format is standard or not. If it's a + # standard format, than it's a date, otherwise, it is a datetime. + # @styles.standard_style?(style_id) + # STANDARD_STYLES.keys.include?(style_id.to_i) + cell_type = if cell_content < 1.0 + :time + elsif (cell_content - cell_content.floor).abs > 0.000001 + :datetime + else + :date + end + Excelx::Cell.create_cell(cell_type, cell.content, formula, excelx_type, style, hyperlink, base_date, coordinate) + when :date + Excelx::Cell.create_cell(value_type, cell.content, formula, excelx_type, style, hyperlink, base_date, coordinate) + else + Excelx::Cell.create_cell(:number, cell.content, formula, excelx_type, style, hyperlink, coordinate) + end + end + + def extract_coordinate(coordinate) + row, column = ::Roo::Utils.split_coordinate(coordinate) + + Excelx::Coordinate.new(row, column) + end + + def extract_hyperlinks(relationships) + return {} unless (hyperlinks = doc.xpath('/worksheet/hyperlinks/hyperlink')) + + Hash[hyperlinks.map do |hyperlink| + if hyperlink.attribute('id') && (relationship = relationships[hyperlink.attribute('id').text]) + [::Roo::Utils.ref_to_key(hyperlink.attributes['ref'].to_s), relationship.attribute('Target').text] + end + end.compact] + end + + def expand_merged_ranges(cells) + # Extract merged ranges from xml + merges = {} + doc.xpath('/worksheet/mergeCells/mergeCell').each do |mergecell_xml| + tl, br = mergecell_xml['ref'].split(/:/).map { |ref| ::Roo::Utils.ref_to_key(ref) } + for row in tl[0]..br[0] do + for col in tl[1]..br[1] do + next if row == tl[0] && col == tl[1] + merges[[row, col]] = tl + end + end + end + # Duplicate value into all cells in merged range + merges.each do |dst, src| + cells[dst] = cells[src] + end + end + + def extract_cells(relationships) + extracted_cells = Hash[doc.xpath('/worksheet/sheetData/row/c').map do |cell_xml| + key = ::Roo::Utils.ref_to_key(cell_xml['r']) + [key, cell_from_xml(cell_xml, hyperlinks(relationships)[key])] + end] + + expand_merged_ranges(extracted_cells) if @options[:expand_merged_ranges] + + extracted_cells + end + + def extract_dimensions + Roo::Utils.each_element(@path, 'dimension') do |dimension| + return dimension.attributes['ref'].value + end + end + end + end +end diff --git a/lib/roo/excelx/styles.rb b/lib/roo/excelx/styles.rb new file mode 100644 index 0000000..87f1713 --- /dev/null +++ b/lib/roo/excelx/styles.rb @@ -0,0 +1,64 @@ +require 'roo/font' +require 'roo/excelx/extractor' + +module Roo + class Excelx + class Styles < Excelx::Extractor + # convert internal excelx attribute to a format + def style_format(style) + id = num_fmt_ids[style.to_i] + num_fmts[id] || Excelx::Format::STANDARD_FORMATS[id.to_i] + end + + def definitions + @definitions ||= extract_definitions + end + + private + + def num_fmt_ids + @num_fmt_ids ||= extract_num_fmt_ids + end + + def num_fmts + @num_fmts ||= extract_num_fmts + end + + def fonts + @fonts ||= extract_fonts + end + + def extract_definitions + doc.xpath('//cellXfs').flat_map do |xfs| + xfs.children.map do |xf| + fonts[xf['fontId'].to_i] + end + end + end + + def extract_fonts + doc.xpath('//fonts/font').map do |font_el| + Font.new.tap do |font| + font.bold = !font_el.xpath('./b').empty? + font.italic = !font_el.xpath('./i').empty? + font.underline = !font_el.xpath('./u').empty? + end + end + end + + def extract_num_fmt_ids + doc.xpath('//cellXfs').flat_map do |xfs| + xfs.children.map do |xf| + xf['numFmtId'] + end + end.compact + end + + def extract_num_fmts + Hash[doc.xpath('//numFmt').map do |num_fmt| + [num_fmt['numFmtId'], num_fmt['formatCode']] + end] + end + end + end +end diff --git a/lib/roo/excelx/workbook.rb b/lib/roo/excelx/workbook.rb new file mode 100644 index 0000000..7ef841f --- /dev/null +++ b/lib/roo/excelx/workbook.rb @@ -0,0 +1,59 @@ +require 'roo/excelx/extractor' + +module Roo + class Excelx + class Workbook < Excelx::Extractor + class Label + attr_reader :sheet, :row, :col, :name + + def initialize(name, sheet, row, col) + @name = name + @sheet = sheet + @row = row.to_i + @col = ::Roo::Utils.letter_to_number(col) + end + + def key + [@row, @col] + end + end + + def initialize(path) + super + fail ArgumentError, 'missing required workbook file' unless doc_exists? + end + + def sheets + doc.xpath('//sheet') + end + + # aka labels + def defined_names + Hash[doc.xpath('//definedName').map do |defined_name| + # "Sheet1!$C$5" + sheet, coordinates = defined_name.text.split('!$', 2) + col, row = coordinates.split('$') + name = defined_name['name'] + [name, Label.new(name, sheet, row, col)] + end] + end + + def base_date + @base_date ||= + begin + # Default to 1900 (minus one day due to excel quirk) but use 1904 if + # it's set in the Workbook's workbookPr + # http://msdn.microsoft.com/en-us/library/ff530155(v=office.12).aspx + result = Date.new(1899, 12, 30) # default + doc.css('workbookPr[date1904]').each do |workbookPr| + if workbookPr['date1904'] =~ /true|1/i + result = Date.new(1904, 01, 01) + break + end + end + result + end + end + end + end +end |
