Skip to content

Commit

Permalink
Merge pull request #33 from theodi/allow-new-lines
Browse files Browse the repository at this point in the history
Change from parsing CSV line by line to using CSV.new and trapping errors
  • Loading branch information
pezholio committed Jan 17, 2014
2 parents 6e1a103 + 3462ac0 commit b495b5b
Show file tree
Hide file tree
Showing 11 changed files with 167 additions and 63 deletions.
7 changes: 2 additions & 5 deletions features/csv_options.feature
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,8 @@ Feature: CSV options
And that warning should have the type "check_options"
Scenario: Use esoteric line endings
Given I have a CSV with the following content:
"""
"Foo","Bar","Baz"|"1","2","3"|"3","2","1"
"""
And I set the line endings to "|"
Given I have a CSV file called "windows-line-endings.csv"
And I set the line endings to windows
And it is stored at the url "http://example.com/example1.csv"
When I ask if the CSV is valid
Then I should get the value of true
Expand Down
2 changes: 2 additions & 0 deletions features/fixtures/windows-line-endings.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
a,b,c
d,e,f
24 changes: 24 additions & 0 deletions features/parse_csv.feature
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,30 @@ Feature: Parse CSV
When I ask if the CSV is valid
Then I should get the value of true

Scenario: Successfully parse a CSV with newlines in quoted fields
Given I have a CSV with the following content:
"""
"a","b","c"
"d","e","this is
valid"
"a","b","c"
"""
And it is stored at the url "http://example.com/example1.csv"
When I ask if the CSV is valid
Then I should get the value of true

Scenario: Successfully parse a CSV with multiple newlines in quoted fields
Given I have a CSV with the following content:
"""
"a","b","c"
"d","this is
valid","as is this
too"
"""
And it is stored at the url "http://example.com/example1.csv"
When I ask if the CSV is valid
Then I should get the value of true

Scenario: Successfully report an invalid CSV
Given I have a CSV with the following content:
"""
Expand Down
10 changes: 5 additions & 5 deletions features/step_definitions/csv_options_steps.rb
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
Given(/^I set the delimiter to "(.*?)"$/) do |delimiter|
@csv_options ||= {}
@csv_options ||= default_csv_options
@csv_options["delimiter"] = delimiter
end

Given(/^I set quotechar to "(.*?)"$/) do |doublequote|
@csv_options ||= {}
@csv_options ||= default_csv_options
@csv_options["quotechar"] = doublequote
end

Given(/^I set the line endings to "(.*?)"$/) do |arg1|
@csv_options ||= {}
@csv_options["lineterminator"] = "|"
Given(/^I set the line endings to windows$/) do
@csv_options ||= default_csv_options
@csv_options["lineterminator"] = "\r\n"
end
4 changes: 2 additions & 2 deletions features/step_definitions/information_steps.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
end

Then(/^the "(.*?)" should be "(.*?)"$/) do |type, encoding|
validator = Csvlint::Validator.new( @url )
validator = Csvlint::Validator.new( @url, default_csv_options )
validator.send(type.to_sym).should == encoding
end

Then(/^the metadata content type should be "(.*?)"$/) do |content_type|
validator = Csvlint::Validator.new( @url )
validator = Csvlint::Validator.new( @url, default_csv_options )
validator.headers['content-type'].should == content_type
end
1 change: 1 addition & 0 deletions features/step_definitions/parse_csv_steps.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
end

# Runs the validator against the scenario's URL and records whether the CSV
# was considered valid. Falls back to default_csv_options when no earlier
# step has populated @csv_options.
When(/^I ask if the CSV is valid$/) do
  @csv_options = default_csv_options unless @csv_options
  @validator = Csvlint::Validator.new(@url, @csv_options)
  @valid = @validator.valid?
end
Expand Down
7 changes: 6 additions & 1 deletion features/step_definitions/validation_errors_steps.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
When(/^I ask if there are errors$/) do
@validator = Csvlint::Validator.new( @url )
@csv_options ||= default_csv_options
@validator = Csvlint::Validator.new( @url, @csv_options )
@errors = @validator.errors
end

Expand All @@ -26,4 +27,8 @@
# Points the scenario at a URL that is stubbed to answer 404.
#
# The URL must be a well-formed http:// URL: without the colon after the
# scheme it is not an HTTP URL at all, so the stubbed 404 response below
# would never be the thing the validator reacts to.
Given(/^I have a CSV that doesn't exist$/) do
  @url = "http://www.example.com/fake-csv.csv"
  stub_request(:get, @url).to_return(:status => 404)
end

# Asserts that none of the errors collected in @errors has the given type.
Then(/^there should be no "(.*?)" errors$/) do |type|
  unwanted = type.to_sym
  @errors.each { |error| error.type.should_not == unwanted }
end
1 change: 1 addition & 0 deletions features/step_definitions/validation_warnings_steps.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
end

# Runs the validator against the scenario's URL and keeps its warnings for
# later steps. Falls back to default_csv_options when no earlier step has
# populated @csv_options.
When(/^I ask if there are warnings$/) do
  @csv_options = default_csv_options unless @csv_options
  @validator = Csvlint::Validator.new(@url, @csv_options)
  @warnings = @validator.warnings
end
Expand Down
12 changes: 12 additions & 0 deletions features/support/env.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,16 @@

# Load the library under test inside Spork's each_run hook.
Spork.each_run { require 'csvlint' }

# World object for the Cucumber steps; provides helpers shared between
# step definition files.
class CustomWorld
  # Baseline CSV dialect options used when a scenario sets none itself.
  #
  # A new Hash is built on every call because step definitions add keys to
  # the returned value.
  def default_csv_options
    { "lineterminator" => "\n" }
  end
end

# Use a CustomWorld instance as the world object for the steps.
World { CustomWorld.new }
26 changes: 19 additions & 7 deletions features/validation_errors.feature
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ Feature: Get validation errors
And that error should have the row "2"
And that error should have the content ""","","
Scenario: Successfully report a CSV with trailing empty row
Scenario: Successfully report a CSV with multiple trailing empty rows
Given I have a CSV with the following content:
"""
"Foo","Bar","Baz"
Expand All @@ -64,25 +64,37 @@ Feature: Get validation errors
Then there should be 1 error
And that error should have the type "blank_rows"
And that error should have the row "3"
And that error should have no content
Scenario: Successfully report a CSV with an empty row
Given I have a CSV with the following content:
"""
"Foo","Bar","Baz"

"Foo","Bar","Baz"
"""
And it is stored at the url "http://example.com/example1.csv"
When I ask if there are errors
Then there should be 1 error
And that error should have the type "blank_rows"
And that error should have the row "2"
Scenario: Report invalid Encoding
Scenario: Report invalid Encoding
Given I have a CSV file called "invalid-byte-sequence.csv"
And I set an encoding header of "UTF-8"
And it is stored at the url "http://example.com/example1.csv"
When I ask if there are errors
Then there should be 1 error
Then there should be 4 error
And that error should have the type "invalid_encoding"
Scenario: Correctly handle different encodings
Scenario: Correctly handle different encodings
Given I have a CSV file called "invalid-byte-sequence.csv"
And I set an encoding header of "ISO-8859-1"
And it is stored at the url "http://example.com/example1.csv"
When I ask if there are errors
Then there should be 0 error
Then there should be no "content_encoding" errors
Scenario: Report invalid file
Scenario: Report invalid file
Given I have a CSV file called "spreadsheet.xls"
And it is stored at the url "http://example.com/example1.csv"
When I ask if there are errors
Expand Down
136 changes: 93 additions & 43 deletions lib/csvlint/validate.rb
Original file line number Diff line number Diff line change
Expand Up @@ -26,53 +26,71 @@ def valid?
end

def validate
expected_columns = 0
current_line = 0
single_col = false
reported_invalid_encoding = false
single_col = false
begin
open(@url) do |s|
@encoding = s.charset rescue nil
@content_type = s.content_type rescue nil
@headers = s.meta
if @headers["content-type"] !~ /charset=/
build_warnings(:no_encoding, nil)
else
build_warnings(:encoding, nil) if @encoding != "utf-8"
end
build_warnings(:no_content_type, nil) if @content_type == nil
build_warnings(:excel, nil) if @content_type == nil && @extension =~ /.xls(x)?/
build_errors(:wrong_content_type, nil) unless (@content_type && @content_type =~ /text\/csv/)
s.each_line(@line_terminator) do |line|
begin
current_line = current_line + 1
@csv_options[:encoding] = @encoding
row = CSV.parse(line.chomp(@line_terminator), @csv_options)[0]
if row
build_formats(row, current_line)
single_col = true if row.count == 1
expected_columns = row.count unless expected_columns != 0
build_errors(:ragged_rows, current_line, line) if row.count != expected_columns
build_errors(:blank_rows, current_line, line) if row.reject{ |c| c.nil? || c.empty? }.count == 0
else
build_errors(:blank_rows, current_line, nil)
end
rescue CSV::MalformedCSVError => e
type = fetch_error(e)
build_errors(type, current_line, line)
rescue ArgumentError => ae
build_errors(:invalid_encoding, current_line, line) unless reported_invalid_encoding
reported_invalid_encoding = true
end
end
open(@url) do |io|
validate_metadata(io)
columns = parse_csv(io)
build_warnings(:check_options, nil) if columns == 1
end
check_consistency
build_warnings(:check_options, nil) if single_col == true
rescue OpenURI::HTTPError, Errno::ENOENT
build_errors(:not_found, nil)
end
#binding.pry
end

# Records transport metadata from the opened resource and raises the
# corresponding warnings/errors.
#
# Sets @encoding, @content_type and @headers from +io+ (charset and
# content_type fall back to nil if the reader raises), then:
# * warns :no_encoding when the content-type header carries no charset,
#   or :encoding when a charset is declared but is not "utf-8";
# * warns :no_content_type when no content type is available, plus :excel
#   when @extension also looks like an Excel extension;
# * reports a :wrong_content_type error unless the content type mentions
#   text/csv.
def validate_metadata(io)
  @encoding = begin
    io.charset
  rescue StandardError
    nil
  end
  @content_type = begin
    io.content_type
  rescue StandardError
    nil
  end
  @headers = io.meta

  if @headers["content-type"] =~ /charset=/
    build_warnings(:encoding, nil) unless @encoding == "utf-8"
  else
    build_warnings(:no_encoding, nil)
  end

  if @content_type.nil?
    build_warnings(:no_content_type, nil)
    build_warnings(:excel, nil) if @extension =~ /.xls(x)?/
  end

  csv_content = @content_type && @content_type =~ /text\/csv/
  build_errors(:wrong_content_type, nil) unless csv_content
end

# Reads +io+ row by row through CSV (rather than line by line, so quoted
# fields may contain line terminators) and records what it finds.
#
# For every parsed row the formats are collected via build_formats, and
# :ragged_rows / :blank_rows errors are reported together with the raw text
# captured by the WrappedIO. Parser failures are reported per row and
# parsing carries on with the next row; :invalid_encoding is reported at
# most once.
#
# Returns the field count of the first row that had any fields, or 0 when
# no such row was read.
def parse_csv(io)
  column_count = 0
  row_number = 0
  encoding_error_reported = false

  @csv_options[:encoding] = @encoding

  source = WrappedIO.new(io)
  reader = CSV.new(source, @csv_options)

  loop do
    row_number += 1
    begin
      fields = reader.shift
      # Tell the wrapper the row is complete so the next read starts a new buffer.
      source.finished
      break unless fields

      build_formats(fields, row_number)
      column_count = fields.count if column_count == 0
      unless fields.empty? || fields.count == column_count
        build_errors(:ragged_rows, row_number, source.line)
      end
      if fields.all? { |cell| cell.nil? || cell.empty? }
        build_errors(:blank_rows, row_number, source.line)
      end
    rescue CSV::MalformedCSVError => malformed
      source.finished
      build_errors(fetch_error(malformed), row_number, source.line)
    rescue ArgumentError
      source.finished
      build_errors(:invalid_encoding, row_number, source.line) unless encoding_error_reported
      encoding_error_reported = true
    end
  end

  column_count
end


def build_message(type, row, content)
Csvlint::ErrorMessage.new({
:type => type,
Expand All @@ -90,20 +108,22 @@ def build_warnings(type, row = nil, content = nil)
end

# Translates a CSV parser exception into one of the validator's error types.
#
# error - an exception whose #message is the CSV parser's message.
#
# Messages starting with "Unquoted fields do not allow" map to :quoting.
# Otherwise the text preceding " in line N" / " on line N" is looked up in
# ERROR_MATCHERS. Returns :unknown_error when the text is not listed there
# or when the message does not have that shape at all.
def fetch_error(error)
  message = error.message
  return :quoting if message.start_with?("Unquoted fields do not allow")

  matched = message.match(/^([a-z ]+) (i|o)n line ([0-9]+)\.?$/i)
  # Without this guard an unrecognised message raised NoMethodError (nil[1])
  # instead of being reported as an unknown error.
  return :unknown_error unless matched

  ERROR_MATCHERS.fetch(matched[1], :unknown_error)
end

def dialect_to_csv_options(dialect)
return {} unless dialect
#supplying defaults here just in case the dialect is invalid
dialect ||= {}
#supplying defaults here just in case the dialect is invalid
delimiter = dialect["delimiter"] || ","
skipinitialspace = dialect["skipinitialspace"] || true
delimiter = delimiter + " " if !skipinitialspace
return {
:col_sep => delimiter,
:row_sep => ( dialect["lineterminator"] || nil ),
:row_sep => ( dialect["lineterminator"] || "\r\n" ),
:quote_char => ( dialect["quotechar"] || '"'),
:skip_blanks => false
}
end

Expand Down Expand Up @@ -149,4 +169,34 @@ def parse_extension(url)
end

end


# Wraps an IO so that the raw text consumed for the row currently being
# parsed can be retrieved afterwards (e.g. to attach it to an error).
#
# Usage: hand the wrapper to the parser in place of the IO, call #finished
# once a row has been parsed (or has failed to parse), and read #line to get
# the text that made up that row. The buffer is cleared by the first #gets
# that follows a #finished, so a row spanning several physical lines is
# kept in full.
class WrappedIO
  # Raw text read since the start of the current row.
  attr_reader :line

  # io - any object responding to #gets and #eof?.
  def initialize(io)
    @io = io
    @line = ""
    @new_line = false
  end

  # Reads from the underlying IO, forwarding the arguments unchanged, and
  # appends whatever was read to the current row buffer.
  #
  # Returns the String read, or nil at end of input.
  def gets(*args)
    if @new_line
      @line = ""
      # Reset the flag so further reads belonging to the same row accumulate
      # instead of wiping the buffer on every call.
      @new_line = false
    end
    chunk = @io.gets(*args)
    @line << chunk unless chunk.nil?
    chunk
  end

  # True when the underlying IO has no more data.
  def eof?
    @io.eof?
  end

  # Marks the current row as complete; the next #gets starts a new buffer.
  def finished
    @new_line = true
  end
end
end

0 comments on commit b495b5b

Please sign in to comment.