Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes to markdown parser following trials on Julia Blog posts #218

Merged
merged 3 commits into from
Sep 10, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/converter/fixer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ function find_and_fix_md_links(hs::String)::String
# the regexes very readable...

# here we're looking for [id]: link; 1=id 2=link
m_link_defs = collect(eachmatch(r"&#91;((?:(?!&#93;).)*?)&#93;:\s((?:(?!\<\/p\>)\S)+)", hs))
m_link_defs = collect(eachmatch(r"&#91;((?:(?!&#93;).)*?)&#93;:\s+((?:(?!\<\/p\>)\S)+)", hs))

def_names = [def.captures[1] for def in m_link_defs]
def_links = [def.captures[2] for def in m_link_defs]
Expand Down
6 changes: 4 additions & 2 deletions src/converter/md.jl
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,11 @@ function convert_md(mds::String, pre_lxdefs::Vector{LxDef}=Vector{LxDef}();
#> 2. Open-Close blocks (OCBlocks)
#>> a. find them
blocks, tokens = find_all_ocblocks(tokens, MD_OCB_ALL)
#>> b. now that blocks have been found, line-returns can be dropped
#>> b. merge CODE_BLOCK_IND which are separated by emptiness
merge_indented_code_blocks!(blocks, mds)
#>> c. now that blocks have been found, line-returns can be dropped
filter!(τ -> τ.name ∉ L_RETURNS, tokens)
#>> c. filter out "fake headers" (opening ### that are not at the start of a line)
#>> d. filter out "fake headers" (opening ### that are not at the start of a line)
filter!(β -> validate_header_block(β), blocks)

#> 3. LaTeX commands
Expand Down
6 changes: 3 additions & 3 deletions src/converter/md_blocks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ function convert_block(β::AbstractBlock, lxcontext::LxContext)::AbstractString
# Return relevant interpolated string based on case
βn = β.name
βn ∈ MD_HEADER && return convert_header(β)
βn == :CODE_INLINE && return md2html(β.ss; stripp=true, code=true)
βn == :CODE_INLINE && return html_code_inline(content(β) |> Markdown.htmlesc)
βn == :CODE_BLOCK_LANG && return convert_code_block(β.ss)
βn == :CODE_BLOCK_IND && return convert_indented_code_block(β.ss)
βn == :CODE_BLOCK && return md2html(β.ss; code=true)
βn == :CODE_BLOCK && return md2html(β.ss)
βn == :ESCAPE && return chop(β.ss, head=3, tail=3)

# Math block --> needs to call further processing to resolve possible latex
Expand Down Expand Up @@ -185,5 +185,5 @@ function convert_indented_code_block(ss::SubString)::String
# 1. decrease indentation of all lines (either frontal \n\t or \n⎵⎵⎵⎵)
code = replace(ss, r"\n(?:\t| {4})" => "\n")
# 2. return; lang is a LOCAL_PAGE_VARS that is julia by default and can be set
return html_code(code, "{{fill lang}}")
return html_code(strip(code), "{{fill lang}}")
end
6 changes: 2 additions & 4 deletions src/converter/md_utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,11 @@ that don't need to be further considered and don't contain anything else than ma
The boolean `stripp` indicates whether to remove the inserted `<p>` and `</p>` by the base markdown
processor, this is relevant for things that are parsed within latex commands etc.
"""
function md2html(ss::AbstractString; stripp::Bool=false, code::Bool=false)::AbstractString

function md2html(ss::AbstractString; stripp::Bool=false)::AbstractString
# if there's nothing, return that...
isempty(ss) && return ss

# Use Julia's Markdown parser followed by Julia's MD->HTML conversion
partial = ss |> fix_inserts |> Markdown.parse |> Markdown.html

# In some cases, base converter adds <p>...</p>\n which we might not want
stripp || return partial
startswith(partial, "<p>") && (partial = chop(partial, head=3))
Expand Down
9 changes: 8 additions & 1 deletion src/misc_html.jl
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ html_img(src::AbstractString, alt::AbstractString="") =
"""
$(SIGNATURES)

Convenience function to introduce an image.
Convenience function to introduce a code block.
"""
function html_code(c::AbstractString, lang::AbstractString="")
isempty(c) && return ""
Expand All @@ -43,6 +43,13 @@ end
"""
$(SIGNATURES)

Convenience function to introduce inline code.
"""
html_code_inline(c::AbstractString) = "<code>$c</code>"

"""
$(SIGNATURES)

Insertion of a visible red message in HTML to show there was a problem.
"""
html_err(mess::String="") = "<p><span style=\"color:red;\">// $mess //</span></p>"
59 changes: 56 additions & 3 deletions src/parser/ocblocks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ Find active blocks between an opening token (`otoken`) and a closing token `ctok
nested (e.g. braces). Return the list of such blocks. If `deactivate` is `true`, all the tokens
within the block will be marked as inactive (for further, separate processing).
"""
function find_ocblocks(tokens::Vector{Token}, ocproto::OCProto; inmath=false)
function find_ocblocks(tokens::Vector{Token}, ocproto::OCProto;
inmath=false)::Tuple{Vector{OCBlock}, Vector{Token}}

ntokens = length(tokens)
active_tokens = ones(Bool, length(tokens))
Expand Down Expand Up @@ -127,8 +128,8 @@ function find_indented_blocks(tokens::Vector{Token}, st::String)::Vector{Token}
# blocks.
for i in 1:length(lr_idx)-1
# capture start and finish of the line (from line return to line return)
start = from(tokens[lr_idx[i]]) # first :LINE_RETURN
finish = from(tokens[lr_idx[i+1]]) # next :LINE_RETURN
start = from(tokens[lr_idx[i]]) # first :LINE_RETURN
finish = from(tokens[lr_idx[i+1]]) # next :LINE_RETURN
line = subs(st, start, finish)
indent = ""
if startswith(line, "\n ")
Expand All @@ -155,3 +156,55 @@ function find_indented_blocks(tokens::Vector{Token}, st::String)::Vector{Token}
end
return tokens
end


"""
$SIGNATURES

When two indented code blocks follow each other and there's nothing in between (empty line(s)),
merge them into a super block.
"""
function merge_indented_code_blocks!(blocks::Vector{OCBlock}, mds::String)::Nothing
# indices of CODE_BLOCK_IND
idx = [i for i in eachindex(blocks) if blocks[i].name == :CODE_BLOCK_IND]
isempty(idx) && return
# check if they're separated by something or nothing
inter_space = [(subs(mds, to(blocks[idx[i]]), from(blocks[idx[i+1]])) |> strip |> length) > 0
for i in 1:length(idx)-1]

curseq = Int[] # to keep track of current list of blocks to merge
del_blocks = Int[] # to keep track of blocks that will be removed afterwards

# if there's no inter_space, add to the list, if there is, close and merge
for i in eachindex(inter_space)
if inter_space[i] && !isempty(curseq)
# close and merge all in curseq and empty curseq
form_super_block!(blocks, idx, curseq, del_blocks)
elseif !inter_space[i]
push!(curseq, i)
end
end
!isempty(curseq) && form_super_block!(blocks, idx, curseq, del_blocks)
# remove the blocks that have been merged
deleteat!(blocks, del_blocks)
return
end


"""
$SIGNATURES

Helper function to [`merge_indented_code_blocks`](@ref).
"""
function form_super_block!(blocks::Vector{OCBlock}, idx::Vector{Int},
curseq::Vector{Int}, del_blocks::Vector{Int})::Nothing
push!(curseq, curseq[end]+1)
first_block = blocks[idx[curseq[1]]]
last_block = blocks[idx[curseq[end]]]
# replace the first block with the super block
blocks[idx[curseq[1]]] = OCBlock(:CODE_BLOCK_IND, (otok(first_block) => ctok(last_block)))
# append all blocks but the first to the delete list
append!(del_blocks, curseq[2:end])
empty!(curseq)
return
end
49 changes: 48 additions & 1 deletion test/converter/markdown3.jl
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ end
tokens, = steps[:tokenization]
@test tokens[7].name == :CHAR_LINEBREAK
h = st |> seval
@test isapproxstr(st |> seval, """
@test isapproxstr(st |> seval, raw"""
<p>Hello &#92; blah &#92; end
and <code>B \ c</code> end <br/> and
<pre><code>A \ b</code></pre>
Expand Down Expand Up @@ -285,4 +285,51 @@ end
</ul>
<p>end</p>
""")

st = raw"""
A

function foo()

return 2

end

function bar()
return 3
end

B

function baz()
return 5

end

C
""" * J.EOS
isapproxstr(st |> seval, raw"""
<p>A <pre><code class="language-julia">function foo()

return 2

end

function bar()
return 3
end</code></pre>
B <pre><code class="language-julia">function baz()
return 5

end</code></pre>
C</p>
""")
end


# Double-backtick inline code must be rendered as a plain <code> element.
@testset "More ``" begin
    st = raw"""
        A ``blah``.
        """ * J.EOS
    # `@test` is needed here: a bare `isapproxstr(...)` call discards its result and so
    # would not check anything
    @test isapproxstr(st |> seval, """<p>A <code>blah</code>.</p>""")
end
1 change: 1 addition & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ println("🍺")
println("CONVERTER/MD")
include("converter/markdown.jl")
include("converter/markdown2.jl")
include("converter/markdown3.jl")
include("converter/hyperref.jl")
println("🍺")

Expand Down
1 change: 1 addition & 0 deletions test/test_utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ function explore_md_steps(mds)

# tokenize
tokens = J.find_tokens(mds, J.MD_TOKENS, J.MD_1C_TOKENS)
tokens = J.find_indented_blocks(tokens, mds)
steps[:tokenization] = (tokens=tokens,)

# ocblocks
Expand Down