From 51c7db64d6c969938a126cc6963405538c6ad074 Mon Sep 17 00:00:00 2001 From: James McKinney Date: Wed, 7 Oct 2015 14:01:04 -0400 Subject: [PATCH 1/2] Memoize the result of CSV#encode_re --- lib/csvlint/validate.rb | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/lib/csvlint/validate.rb b/lib/csvlint/validate.rb index e87be357..0428ffd5 100644 --- a/lib/csvlint/validate.rb +++ b/lib/csvlint/validate.rb @@ -1,6 +1,16 @@ module Csvlint class Validator + class LineCSV < CSV + ENCODE_RE = Hash.new do |h,str| + h[str] = Regexp.new(str) + end + + # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2273 + def encode_re(*chunks) + ENCODE_RE[encode_str(*chunks)] + end + end include Csvlint::ErrorCollector @@ -132,12 +142,12 @@ def parse_contents(stream, line = nil) @csv_options[:encoding] = @encoding begin - row = CSV.parse_line(stream, @csv_options) + row = LineCSV.parse_line(stream, @csv_options) # this is a one line substitute for CSV.new followed by row = CSV.shift. a CSV Row class is required # CSV.parse will return an array of arrays which breaks subsequent each_with_index invocations # TODO investigate if above would be a drag on memory - rescue CSV::MalformedCSVError => e + rescue LineCSV::MalformedCSVError => e build_exception_messages(e, stream, current_line) end @@ -228,7 +238,7 @@ def header? def report_line_breaks(line_no=nil) return if @input !~ /[\r|\n]/ # Return straight away if there's no newline character - i.e. we're on the last line - line_break = CSV.new(@input).row_sep + line_break = LineCSV.new(@input).row_sep @line_breaks << line_break unless line_breaks_reported? if line_break != "\r\n" From 77e54a46cb26639f4d535480597e8f49c8bf0468 Mon Sep 17 00:00:00 2001 From: James McKinney Date: Wed, 7 Oct 2015 15:15:24 -0400 Subject: [PATCH 2/2] Memoize the result of CSV#encode_str, CSV#escape_re, and disable CSV's converters feature --- lib/csvlint/validate.rb | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/lib/csvlint/validate.rb b/lib/csvlint/validate.rb index 0428ffd5..2ca707bc 100644 --- a/lib/csvlint/validate.rb +++ b/lib/csvlint/validate.rb @@ -6,10 +6,46 @@ class LineCSV < CSV h[str] = Regexp.new(str) end + ENCODE_STR = Hash.new do |h,encoding_name| + h[encoding_name] = Hash.new do |h,chunks| + h[chunks] = chunks.map { |chunk| chunk.encode(encoding_name) }.join('') + end + end + + ESCAPE_RE = Hash.new do |h,re_chars| + h[re_chars] = Hash.new do |h,re_esc| + h[re_esc] = Hash.new do |h,str| + h[str] = str.gsub(re_chars) {|c| re_esc + c} + end + end + end + + # Optimization: Memoize `encode_re`. # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2273 def encode_re(*chunks) ENCODE_RE[encode_str(*chunks)] end + + # Optimization: Memoize `encode_str`. + # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2281 + def encode_str(*chunks) + ENCODE_STR[@encoding.name][chunks] + end + + # Optimization: Memoize `escape_re`. + # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2265 + def escape_re(str) + ESCAPE_RE[@re_chars][@re_esc][str] + end + + # Optimization: Disable the CSV library's converters feature. + # @see https://github.com/ruby/ruby/blob/v2_2_3/lib/csv.rb#L2100 + def init_converters(options, field_name = :converters) + @converters = [] + @header_converters = [] + options.delete(:unconverted_fields) + options.delete(field_name) + end end include Csvlint::ErrorCollector