Skip to content

Commit

Permalink
Speed up #build_formats (changes its API)
Browse files Browse the repository at this point in the history
  • Loading branch information
James McKinney committed Nov 9, 2014
1 parent cf3dc46 commit 73710bc
Show file tree
Hide file tree
Showing 6 changed files with 175 additions and 196 deletions.
9 changes: 7 additions & 2 deletions lib/csvlint.rb
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
require 'csv'
require 'date'
require 'open-uri'
require 'mime/types'
require 'set'
require 'tempfile'

require 'csvlint/types'
require 'active_support/core_ext/date/conversions'
require 'active_support/core_ext/time/conversions'
require 'mime/types'
require 'open_uri_redirections'

require 'csvlint/error_message'
require 'csvlint/error_collector'
require 'csvlint/validate'
Expand Down
69 changes: 68 additions & 1 deletion lib/csvlint/field.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ module Csvlint

class Field
include Csvlint::ErrorCollector
include Csvlint::Types

attr_reader :name, :constraints, :title, :description

Expand Down Expand Up @@ -98,5 +97,73 @@ def convert_to_type(value)
end
return parsed
end

# Converts a value to an Integer. Strings are parsed strictly as base 10:
# without an explicit base, Kernel#Integer honours Ruby literal prefixes, so
# "08" raises (invalid octal), "010" becomes 8 and "0x1A" becomes 26.
# Non-String values keep the plain Integer() conversion.
parse_integer = lambda do |value|
  value.is_a?(String) ? Integer(value, 10) : Integer(value)
end

# Maps an XML Schema datatype URI to a lambda taking (value, constraints).
# Each lambda returns the converted value and raises when the value does not
# conform: ArgumentError from the explicit checks and from Integer/Float/
# strptime, or whatever URI.parse raises for anyURI.
# The date/time entries read an optional "datePattern" key from constraints
# and require the value to round-trip through strptime/strftime unchanged.
TYPE_VALIDATIONS = {
  'http://www.w3.org/2001/XMLSchema#string' => lambda { |value, constraints| value },
  'http://www.w3.org/2001/XMLSchema#int' => lambda { |value, constraints| parse_integer.call(value) },
  'http://www.w3.org/2001/XMLSchema#integer' => lambda { |value, constraints| parse_integer.call(value) },
  'http://www.w3.org/2001/XMLSchema#float' => lambda { |value, constraints| Float value },
  'http://www.w3.org/2001/XMLSchema#double' => lambda { |value, constraints| Float value },
  'http://www.w3.org/2001/XMLSchema#anyURI' => lambda do |value, constraints|
    # Only http and https URIs are accepted.
    u = URI.parse value
    raise ArgumentError unless u.kind_of?(URI::HTTP) || u.kind_of?(URI::HTTPS)
    u
  end,
  'http://www.w3.org/2001/XMLSchema#boolean' => lambda do |value, constraints|
    # `return` inside a lambda returns from the lambda only.
    return true if ['true', '1'].include? value
    return false if ['false', '0'].include? value
    raise ArgumentError
  end,
  'http://www.w3.org/2001/XMLSchema#nonPositiveInteger' => lambda do |value, constraints|
    i = parse_integer.call(value)
    raise ArgumentError unless i <= 0
    i
  end,
  'http://www.w3.org/2001/XMLSchema#negativeInteger' => lambda do |value, constraints|
    i = parse_integer.call(value)
    raise ArgumentError unless i < 0
    i
  end,
  'http://www.w3.org/2001/XMLSchema#nonNegativeInteger' => lambda do |value, constraints|
    i = parse_integer.call(value)
    raise ArgumentError unless i >= 0
    i
  end,
  'http://www.w3.org/2001/XMLSchema#positiveInteger' => lambda do |value, constraints|
    i = parse_integer.call(value)
    raise ArgumentError unless i > 0
    i
  end,
  'http://www.w3.org/2001/XMLSchema#dateTime' => lambda do |value, constraints|
    date_pattern = constraints["datePattern"] || "%Y-%m-%dT%H:%M:%SZ"
    d = DateTime.strptime(value, date_pattern)
    # Re-format and compare so lenient parses (e.g. unpadded fields) are rejected.
    raise ArgumentError unless d.strftime(date_pattern) == value
    d
  end,
  'http://www.w3.org/2001/XMLSchema#date' => lambda do |value, constraints|
    date_pattern = constraints["datePattern"] || "%Y-%m-%d"
    d = Date.strptime(value, date_pattern)
    raise ArgumentError unless d.strftime(date_pattern) == value
    d
  end,
  'http://www.w3.org/2001/XMLSchema#time' => lambda do |value, constraints|
    date_pattern = constraints["datePattern"] || "%H:%M:%S"
    d = DateTime.strptime(value, date_pattern)
    raise ArgumentError unless d.strftime(date_pattern) == value
    d
  end,
  'http://www.w3.org/2001/XMLSchema#gYear' => lambda do |value, constraints|
    date_pattern = constraints["datePattern"] || "%Y"
    d = Date.strptime(value, date_pattern)
    raise ArgumentError unless d.strftime(date_pattern) == value
    d
  end,
  'http://www.w3.org/2001/XMLSchema#gYearMonth' => lambda do |value, constraints|
    date_pattern = constraints["datePattern"] || "%Y-%m"
    d = Date.strptime(value, date_pattern)
    raise ArgumentError unless d.strftime(date_pattern) == value
    d
  end,
}
end
end
2 changes: 0 additions & 2 deletions lib/csvlint/schema.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
require "set"

module Csvlint

class Schema
Expand Down
137 changes: 0 additions & 137 deletions lib/csvlint/types.rb

This file was deleted.

108 changes: 79 additions & 29 deletions lib/csvlint/validate.rb
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
require "open_uri_redirections"

module Csvlint

class Validator

include Csvlint::ErrorCollector
include Csvlint::Types

attr_reader :encoding, :content_type, :extension, :headers, :line_breaks, :dialect, :csv_header, :schema, :data

Expand Down Expand Up @@ -111,7 +108,7 @@ def parse_csv(io)
end
row = nil
loop do
current_line = current_line + 1
current_line += 1
begin
wrapper.reset_line
row = csv.shift
Expand All @@ -122,7 +119,7 @@ def parse_csv(io)
validate_header(row)
@col_counts << row.size
else
build_formats(row, current_line)
build_formats(row)
@col_counts << row.reject {|r| r.blank? }.size
@expected_columns = row.size unless @expected_columns != 0

Expand Down Expand Up @@ -195,40 +192,60 @@ def dialect_to_csv_options(dialect)
}
end

# NOTE(review): this span is a rendered diff hunk with the +/- markers
# stripped. It appears to contain both the removed lines (the
# `build_formats(row, line)` signature, the SIMPLE_FORMATS loop and
# `@formats[i] << @format`) and the added lines, so it is not runnable as
# shown; confirm against the real file.
#
# Added version: for every non-blank cell in +row+, picks one format symbol
# through the if/elsif chain below and increments that symbol's count in
# @formats[i], a Hash.new(0) per column index.
def build_formats(row, line)
def build_formats(row)
row.each_with_index do |col, i|
next if col.blank?
@formats[i] ||= []

SIMPLE_FORMATS.each do |type, lambda|
begin
if lambda.call(col)
@format = type
end
rescue ArgumentError, URI::InvalidURIError
@formats[i] ||= Hash.new(0)

# The numeric test runs first, on the stripped cell; the all-digit date
# shapes are only tried inside that branch. Each date candidate must match
# its FORMATS regex and also pass date_format?.
format = if col.strip[FORMATS[:numeric]]
if col[FORMATS[:date_number]] && date_format?(Date, col, '%Y%m%d')
:date_number
elsif col[FORMATS[:dateTime_number]] && date_format?(Time, col, '%Y%m%d%H%M%S')
:dateTime_number
elsif col[FORMATS[:dateTime_nsec]] && date_format?(Time, col, '%Y%m%d%H%M%S%N')
:dateTime_nsec
else
:numeric
end
elsif uri?(col)
:uri
elsif col[FORMATS[:date_db]] && date_format?(Date, col, '%Y-%m-%d')
:date_db
elsif col[FORMATS[:date_short]] && date_format?(Date, col, '%e %b')
:date_short
elsif col[FORMATS[:date_rfc822]] && date_format?(Date, col, '%e %b %Y')
:date_rfc822
elsif col[FORMATS[:date_long]] && date_format?(Date, col, '%B %e, %Y')
:date_long
elsif col[FORMATS[:dateTime_time]] && date_format?(Time, col, '%H:%M')
:dateTime_time
elsif col[FORMATS[:dateTime_hms]] && date_format?(Time, col, '%H:%M:%S')
:dateTime_hms
elsif col[FORMATS[:dateTime_db]] && date_format?(Time, col, '%Y-%m-%d %H:%M:%S')
:dateTime_db
elsif col[FORMATS[:dateTime_iso8601]] && date_format?(Time, col, '%Y-%m-%dT%H:%M:%SZ')
:dateTime_iso8601
elsif col[FORMATS[:dateTime_short]] && date_format?(Time, col, '%d %b %H:%M')
:dateTime_short
elsif col[FORMATS[:dateTime_long]] && date_format?(Time, col, '%B %d, %Y %H:%M')
:dateTime_long
else
# Nothing else matched: fall back to :string.
:string
end

@formats[i] << @format
@formats[i][format] += 1
end
end

# NOTE(review): this span is a rendered diff hunk with the +/- markers
# stripped. It appears to mix the removed `percentages` implementation with
# the added one, so it is not runnable as shown; confirm against the real
# file.
#
# Added version: for each column that has recorded format counts in
# @formats, sums the counts and calls build_warnings(:inconsistent_values,
# ...) with the 1-based column number when no single format accounts for at
# least 90% of that column's counted cells.
def check_consistency
percentages = []

SIMPLE_FORMATS.keys.each do |type|
@formats.each_with_index do |format,i|
percentages[i] ||= {}
unless format.nil?
percentages[i][type] = format.count(type) / format.size.to_f
@formats.each_with_index do |format,i|
if format
# to_f forces floating-point division in the ratio below.
total = format.values.reduce(:+).to_f
if format.none?{|_,count| count / total >= 0.9}
build_warnings(:inconsistent_values, :schema, nil, i + 1)
end
end
end

percentages.each_with_index do |col, i|
next if col.values.blank?
build_warnings(:inconsistent_values, :schema, nil, i+1) if col.values.max < 0.9
end
end

private
Expand All @@ -248,6 +265,39 @@ def parse_extension(source)
File.extname(parsed.path)
end
end


# Returns true when +value+, ignoring surrounding whitespace, parses as an
# HTTP or HTTPS URI, and false otherwise.
#
# The FORMATS[:uri] prefix check runs first so that URI.parse is only
# attempted on values that start with an http(s) scheme.
def uri?(value)
  candidate = value.strip
  return false unless candidate[FORMATS[:uri]]
  # Parse the same stripped text the prefix check accepted; parsing the raw
  # value would reject it again because of the surrounding whitespace.
  uri = URI.parse(candidate)
  uri.kind_of?(URI::HTTP) || uri.kind_of?(URI::HTTPS)
rescue URI::InvalidURIError
  false
end

# Reports whether +value+ is written exactly in +format+.
#
# +klass+ supplies strptime (the visible callers pass Date or Time). The
# value is parsed and then re-formatted; only an identical round trip
# counts, which rejects lenient parses such as unpadded fields. An
# ArgumentError raised while parsing or formatting yields false.
def date_format?(klass, value, format)
  round_tripped = klass.strptime(value, format).strftime(format)
  value == round_tripped
rescue ArgumentError
  false
end

# Cheap regex pre-filters, keyed by format symbol.
# :string has no pattern (nil).
#
# Date::MONTHNAMES and Date::ABBR_MONTHNAMES hold nil at index 0, so they are
# compacted before joining; otherwise the alternation starts with an empty
# branch and the patterns also match text with the month name missing.
FORMATS = {
  :string => nil,
  :numeric => /\A[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?\z/,
  :uri => /\Ahttps?:/,
  :date_db => /\A\d{4,}-\d\d-\d\d\z/,
  :date_long => /\A(?:#{Date::MONTHNAMES.compact.join('|')}) [ \d]\d, \d{4,}\z/,
  :date_number => /\A\d{8}\z/,
  :date_rfc822 => /\A[ \d]\d (?:#{Date::ABBR_MONTHNAMES.compact.join('|')}) \d{4,}\z/,
  :date_short => /\A[ \d]\d (?:#{Date::ABBR_MONTHNAMES.compact.join('|')})\z/,
  :dateTime_db => /\A\d{4,}-\d\d-\d\d \d\d:\d\d:\d\d\z/,
  :dateTime_hms => /\A\d\d:\d\d:\d\d\z/,
  :dateTime_iso8601 => /\A\d{4,}-\d\d-\d\dT\d\d:\d\d:\d\dZ\z/,
  :dateTime_long => /\A(?:#{Date::MONTHNAMES.compact.join('|')}) \d\d, \d{4,} \d\d:\d\d\z/,
  # 23 digits: %Y%m%d%H%M%S (14) followed by %N nanoseconds (9).
  :dateTime_nsec => /\A\d{23}\z/,
  :dateTime_number => /\A\d{14}\z/,
  :dateTime_short => /\A\d\d (?:#{Date::ABBR_MONTHNAMES.compact.join('|')}) \d\d:\d\d\z/,
  :dateTime_time => /\A\d\d:\d\d\z/,
}.freeze
end
end
end
Loading

0 comments on commit 73710bc

Please sign in to comment.