diff --git a/.ruby-version b/.ruby-version
new file mode 100644
index 0000000..be94e6f
--- /dev/null
+++ b/.ruby-version
@@ -0,0 +1 @@
+3.2.2
diff --git a/.tool-versions b/.tool-versions
new file mode 100644
index 0000000..0b2d858
--- /dev/null
+++ b/.tool-versions
@@ -0,0 +1 @@
+ruby 3.2.2
diff --git a/lib/word-to-markdown.rb b/lib/word-to-markdown.rb
index 6d57a47..c44ca21 100644
--- a/lib/word-to-markdown.rb
+++ b/lib/word-to-markdown.rb
@@ -14,6 +14,7 @@
 require_relative 'word-to-markdown/version'
 require_relative 'word-to-markdown/document'
 require_relative 'word-to-markdown/converter'
+require_relative 'word-to-markdown/pandoc-converter'
 require_relative 'nokogiri/xml/element'
 require_relative 'cliver/dependency_ext'
 
@@ -43,9 +44,13 @@ class WordToMarkdown
   # @param path [string] Path to the Word document
   # @param tmpdir [string] Path to a working directory to use
+  # @param use_pandoc [Boolean] Convert with PandocConverter instead of the default Converter
   # @return [WordToMarkdown] WordToMarkdown object with the converted document
-  def initialize(path, tmpdir = nil)
+  def initialize(path, tmpdir = nil, use_pandoc = false)
     @document = WordToMarkdown::Document.new path, tmpdir
-    @converter = WordToMarkdown::Converter.new @document
+
+    converter_class = use_pandoc ? WordToMarkdown::PandocConverter : WordToMarkdown::Converter
+    @converter = converter_class.new @document
+
     converter.convert!
   end
 
diff --git a/lib/word-to-markdown/document.rb b/lib/word-to-markdown/document.rb
index 997f230..6df608a 100644
--- a/lib/word-to-markdown/document.rb
+++ b/lib/word-to-markdown/document.rb
@@ -7,6 +7,8 @@ class NotFoundError < StandardError; end
     class ConversionError < StandardError; end
 
     attr_reader :path, :tmpdir
+    # Writers let a converter (e.g. PandocConverter) assign output it rendered itself
+    attr_writer :markdown, :raw_html
 
     # @param path [string] Path to the Word document
     # @param tmpdir [string] Path to a working directory to use
diff --git a/lib/word-to-markdown/pandoc-converter.rb b/lib/word-to-markdown/pandoc-converter.rb
new file mode 100644
index 0000000..c287c51
--- /dev/null
+++ b/lib/word-to-markdown/pandoc-converter.rb
@@ -0,0 +1,31 @@
+# frozen_string_literal: true
+
+class WordToMarkdown
+  # Converts a Word document by handing its path to Pandoc through the pandoc-ruby gem
+  class PandocConverter
+    attr_reader :document
+
+    # @param document [WordToMarkdown::Document] The document to convert
+    def initialize(document)
+      @document = document
+    end
+
+    # Renders the document to HTML with Pandoc and assigns it as the document's raw HTML
+    #
+    # TODO: Evaluate assigning Pandoc's own Markdown output (GFM, CommonMark, or
+    # extensions), passed through the document's whitespace scrubbing, to document.markdown.
+    def convert!
+      document.raw_html = pandoc.to_html
+    end
+
+    private
+
+    # @return [PandocRuby] Memoized Pandoc wrapper that reads the document path as docx
+    def pandoc
+      @pandoc ||= begin
+        require 'pandoc-ruby' # loaded on first use, so only when this converter actually runs
+        PandocRuby.new([document.path], from: 'docx')
+      end
+    end
+  end
+end
diff --git a/test/helper.rb b/test/helper.rb
index c101f65..6a4c5b4 100644
--- a/test/helper.rb
+++ b/test/helper.rb
@@ -18,7 +18,12 @@ def fixture_path(fixture = '')
 end
 
 def validate_fixture(fixture, expected)
-  assert_equal expected, WordToMarkdown.new(fixture_path(fixture)).to_s
+  path = fixture_path(fixture)
+  libreoffice_output = WordToMarkdown.new(path).to_s
+  assert_equal expected, libreoffice_output, 'LibreOffice'
+
+  pandoc_output = WordToMarkdown.new(path, nil, true).to_s
+  assert_equal expected, pandoc_output, 'Pandoc'
 end
 
 def stub_doc(html)
diff --git a/word-to-markdown.gemspec b/word-to-markdown.gemspec
index c903aa7..1eac271 100644
--- a/word-to-markdown.gemspec
+++ b/word-to-markdown.gemspec
@@ -19,6 +19,7 @@ Gem::Specification.new do |s|
   s.add_dependency('cliver', '~> 0.3')
   s.add_dependency('descriptive_statistics', '~> 2.5')
   s.add_dependency('nokogiri-styles', '~> 0.1')
+  s.add_dependency('pandoc-ruby', '~> 2.0')
   s.add_dependency('premailer', '~> 1.8')
   s.add_dependency('reverse_markdown', '>= 1', '< 3')
   s.add_dependency('sys-proctable', '~> 1.0')