From deb87b9ba9a939ff8a30551e0c9a3f20777ea551 Mon Sep 17 00:00:00 2001
From: Shuhei Kadowaki
Date: Wed, 5 Oct 2022 21:49:35 +0900
Subject: [PATCH] improve performance of `@nospecialize`-d keyword func calls
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit fixes a performance issue when calling keyword funcs whose
argument types are not fully known but are `@nospecialize`-d. The final
result looks like the following (this particular example is taken from our
Julia-level compiler implementation):
```julia
abstract type CallInfo end
struct NoCallInfo <: CallInfo end

struct NewInstruction
    stmt::Any
    type::Any
    info::CallInfo
    line::Union{Int32,Nothing} # if nothing, copy the line from previous statement in the insertion location
    flag::Union{UInt8,Nothing} # if nothing, IR flags will be recomputed on insertion
    function NewInstruction(@nospecialize(stmt), @nospecialize(type), @nospecialize(info::CallInfo),
                            line::Union{Int32,Nothing}, flag::Union{UInt8,Nothing})
        return new(stmt, type, info, line, flag)
    end
end
@nospecialize
function NewInstruction(newinst::NewInstruction;
                        stmt=newinst.stmt,
                        type=newinst.type,
                        info::CallInfo=newinst.info,
                        line::Union{Int32,Nothing}=newinst.line,
                        flag::Union{UInt8,Nothing}=newinst.flag)
    return NewInstruction(stmt, type, info, line, flag)
end
@specialize

using BenchmarkTools

struct VirtualKwargs
    stmt::Any
    type::Any
    info::CallInfo
end

vkws = VirtualKwargs(nothing, Any, NoCallInfo())
newinst = NewInstruction(nothing, Any, NoCallInfo(), nothing, nothing)

runner(newinst, vkws) = NewInstruction(newinst; vkws.stmt, vkws.type, vkws.info)

@benchmark runner($newinst, $vkws)
```

> on master
```
BenchmarkTools.Trial: 10000 samples with 186 evaluations.
 Range (min … max):  559.898 ns …   4.173 μs  ┊ GC (min … max): 0.00% … 85.29%
 Time  (median):     605.608 ns               ┊ GC (median):    0.00%
 Time  (mean ± σ):   638.170 ns ± 125.080 ns  ┊ GC (mean ± σ):  0.06% ±  0.85%

  █▇▂▆▄         ▁█▇▄▂                                           ▂
  ██████▅██████▇▇▇██████▇▇▇▆▆▅▄▅▄▂▄▄▅▇▆▆▆▆▆▅▆▆▄▄▅▅▄▃▄▄▄▅▃▅▅▆▅▆▆ █
  560 ns        Histogram: log(frequency) by time       1.23 μs <

 Memory estimate: 32 bytes, allocs estimate: 2.
```

> on this commit
```julia
BenchmarkTools.Trial: 10000 samples with 1000 evaluations.
 Range (min … max):  3.080 ns … 83.177 ns  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     3.098 ns              ┊ GC (median):    0.00%
 Time  (mean ± σ):   3.118 ns ±  0.885 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

        ▂▅▇█▆▅▄▂
  ▂▄▆▆▇████████▆▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▁▁▂▂▂▁▂▂▂▂▂▂▁▁▂▁▂▂▂▂▂▂▂▂▂ ▃
  3.08 ns        Histogram: frequency by time        3.19 ns <

 Memory estimate: 0 bytes, allocs estimate: 0.
```

So for this particular case the commit achieves roughly a 200x speedup, because
it allows the call to the keyword sorter to be inlined and the `NamedTuple`
construction to be removed.

More specifically, this commit is composed of the following improvements:
- add an early-return case for `structdiff`:
  This change improves return-type inference for the case where the compared
  `NamedTuple`s are type-unstable but their names do not differ, e.g. given two
  `NamedTuple{(:a,:b),T} where T<:Tuple{Any,Any}`s. In such a case the optimizer
  can remove the `structdiff` call and the succeeding `pairs` call, letting the
  keyword sorter be inlined (a small sketch at the end of this message shows
  where these calls come from).
- add special SROA handling for the `NamedTuple` generated by the keyword sorter:
  With the change to `structdiff`, the IR for a call with type-unstable keyword
  arguments would look like this after inlining:
  ```
  %1 = tuple(a, b, c)::Tuple{Any, Any, Any}
  %2 = NamedTuple{(:a, :b, :c)}(%1)::NamedTuple{(:a, :b, :c), _A} where _A<:Tuple{Any, Any, Any}
  %3 = Core.getfield(%2, :a)::Any
  %4 = Core.getfield(%2, :b)::Any
  %5 = Core.getfield(%2, :c)::Any
  [... other body of the keyword func ...]
  ```
  We can implement slightly hacky special handling within our SROA pass that
  checks whether this definition (%2) is a partly-known `NamedTuple` construction,
  i.e. one whose names are fully known, and also whether its call argument (%1)
  is a fully-known `tuple` call. When the length of the `NamedTuple` names matches
  the number of arguments of the `tuple` call, we can safely replace those
  `getfield` calls with the corresponding `tuple` call arguments, letting the
  later DCE pass delete the constructions of the tuple and the named tuple
  altogether.

With these changes, the IR for the example `NewInstruction` constructor is now
fairly well optimized:
```julia
julia> Base.code_ircode((NewInstruction,Any,Any,CallInfo)) do newinst, stmt, type, info
           NewInstruction(newinst; stmt, type, info)
       end |> only
2 1 ── %1  = Base.getfield(_2, :line)::Union{Nothing, Int32}                  │╻╷ Type##kw
  │    %2  = Base.getfield(_2, :flag)::Union{Nothing, UInt8}                  ││┃  getproperty
  │    %3  = (isa)(%1, Nothing)::Bool                                         ││
  │    %4  = (isa)(%2, Nothing)::Bool                                         ││
  │    %5  = (Core.Intrinsics.and_int)(%3, %4)::Bool                          ││
  └───       goto #3 if not %5                                                ││
  2 ── %7  = %new(Main.NewInstruction, _3, _4, _5, nothing, nothing)::NewInstruction  NewInstruction
  └───       goto #10                                                         ││
  3 ── %9  = (isa)(%1, Int32)::Bool                                           ││
  │    %10 = (isa)(%2, Nothing)::Bool                                         ││
  │    %11 = (Core.Intrinsics.and_int)(%9, %10)::Bool                         ││
  └───       goto #5 if not %11                                               ││
  4 ── %13 = π (%1, Int32)                                                    ││
  │    %14 = %new(Main.NewInstruction, _3, _4, _5, %13, nothing)::NewInstruction │││╻ NewInstruction
  └───       goto #10                                                         ││
  5 ── %16 = (isa)(%1, Nothing)::Bool                                         ││
  │    %17 = (isa)(%2, UInt8)::Bool                                           ││
  │    %18 = (Core.Intrinsics.and_int)(%16, %17)::Bool                        ││
  └───       goto #7 if not %18                                               ││
  6 ── %20 = π (%2, UInt8)                                                    ││
  │    %21 = %new(Main.NewInstruction, _3, _4, _5, nothing, %20)::NewInstruction │││╻ NewInstruction
  └───       goto #10                                                         ││
  7 ── %23 = (isa)(%1, Int32)::Bool                                           ││
  │    %24 = (isa)(%2, UInt8)::Bool                                           ││
  │    %25 = (Core.Intrinsics.and_int)(%23, %24)::Bool                        ││
  └───       goto #9 if not %25                                               ││
  8 ── %27 = π (%1, Int32)                                                    ││
  │    %28 = π (%2, UInt8)                                                    ││
  │    %29 = %new(Main.NewInstruction, _3, _4, _5, %27, %28)::NewInstruction  │││╻ NewInstruction
  └───       goto #10                                                         ││
  9 ── Core.throw(ErrorException("fatal error in type inference (type bound)"))::Union{}
  └───       unreachable                                                      ││
  10 ┄ %33 = φ (#2 => %7, #4 => %14, #6 => %21, #8 => %29)::NewInstruction    ││
  └───       goto #11                                                         ││
  11 ─ return %33                                                             │
   => NewInstruction
```
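For context, here is a hypothetical, stripped-down model of the check that a
keyword sorter such as the `Type##kw` method above performs on its keyword
`NamedTuple`; this is where the `structdiff` and `pairs` calls targeted by the
first bullet come from. This is only a sketch, not actual lowering output: the
name `reject_unsupported_kwargs` is made up, and the last call reuses
`NoCallInfo` from the benchmark snippet above.
```julia
# Hypothetical model of the keyword-sorter check; the real code is emitted by lowering.
function reject_unsupported_kwargs(kwargs::NamedTuple, declared::Type{<:NamedTuple})
    # `structdiff` drops every field of `kwargs` whose name is declared by the
    # callee; anything left over is an unsupported keyword argument
    remaining = Base.structdiff(kwargs, declared)
    isempty(pairs(remaining)) || error("unsupported keyword arguments: ", keys(remaining))
    return nothing
end

# mimics `NewInstruction(newinst; stmt, type, info)`: every supplied name is declared,
# so `structdiff` hits the new early return and yields `(;)`
reject_unsupported_kwargs((stmt = nothing, type = Any, info = NoCallInfo()),
                          NamedTuple{(:stmt, :type, :info, :line, :flag)})
```
Because the declared names are a compile-time constant, a precise `structdiff`
result is all inference needs to prove this check dead and inline the keyword
sorter.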
---
 base/compiler/ssair/passes.jl | 39 ++++++++++++++++++++++++++++++++++-
 base/compiler/tfuncs.jl       |  5 +++++
 base/namedtuple.jl            | 11 +++++++---
 test/compiler/inference.jl    |  6 ++++++
 4 files changed, 57 insertions(+), 4 deletions(-)

diff --git a/base/compiler/ssair/passes.jl b/base/compiler/ssair/passes.jl
index 8ad72285fcaeb3..7932f61e8de8aa 100644
--- a/base/compiler/ssair/passes.jl
+++ b/base/compiler/ssair/passes.jl
@@ -415,6 +415,43 @@ function lift_leaves(compact::IncrementalCompact,
                 return nothing
             else
                 typ = argextype(leaf, compact)
+
+                # Here we implement a slightly hacky special case to optimize a function call with
+                # type-unstable but `@nospecialize`-d keyword arguments, whose IR at this point
+                # would look like:
+                # ```
+                # %1 = tuple(a, b, c)::Tuple{Any, Any, Any}
+                # %2 = NamedTuple{(:a, :b, :c)}(%1)::NamedTuple{(:a, :b, :c), _A} where _A<:Tuple{Any, Any, Any}
+                # %3 = Core.getfield(%2, :a)::Any
+                # %4 = Core.getfield(%2, :b)::Any
+                # %5 = Core.getfield(%2, :c)::Any
+                # [... other body of the keyword func ...]
+                # ```
+                # First, we check if this definition (%2) is a partly-known `NamedTuple`
+                # construction, i.e. one whose names are fully known, and also check
+                # if its call argument (%1) is a fully-known `tuple` call.
+                # When the length of the `NamedTuple` names matches the number of
+                # arguments of the `tuple` call, we can safely replace those `getfield`
+                # calls with the corresponding `tuple` call arguments, while letting the
+                # later DCE pass delete the constructions of the tuple and the named tuple.
+                typ′ = unwrap_unionall(widenconst(typ))
+                if isa(typ′, DataType) && typ′.name === _NAMEDTUPLE_NAME && length(typ′.parameters) == 2
+                    ns = typ′.parameters[1]
+                    if is_known_call(def, NamedTuple{ns}, compact) && length(def.args) == 2
+                        arg = def.args[2]
+                        if isa(arg, AnySSAValue)
+                            argexpr = compact[arg][:inst]
+                            if is_known_call(argexpr, tuple, compact) && length(ns) == length(argexpr.args)-1
+                                # ok, we know this NamedTuple construction is nothrow,
+                                # let's mark this NamedTuple as DCE-eligible
+                                compact[leaf::AnySSAValue][:flag] |= IR_FLAG_EFFECT_FREE
+                                lift_arg!(compact, argexpr.args[field+1], cache_key, argexpr, field+1, lifted_leaves)
+                                continue
+                            end
+                        end
+                    end
+                end
+
                 if !isa(typ, Const)
                     # TODO: (disabled since #27126)
                     # If the leaf is an old ssa value, insert a getfield here
@@ -469,7 +506,7 @@ function lift_arg!(
         end
     end
     lifted_leaves[cache_key] = LiftedValue(lifted)
-    nothing
+    return nothing
 end
 
 function walk_to_def(compact::IncrementalCompact, @nospecialize(leaf))
diff --git a/base/compiler/tfuncs.jl b/base/compiler/tfuncs.jl
index 9c206270976b8d..e6750b8c9000c7 100644
--- a/base/compiler/tfuncs.jl
+++ b/base/compiler/tfuncs.jl
@@ -1594,6 +1594,11 @@ function apply_type_tfunc(@nospecialize(headtypetype), @nospecialize args...)
     end
     if istuple
         return Type{<:appl}
+    elseif isa(appl, DataType) && appl.name === _NAMEDTUPLE_NAME && appl.parameters[1] === ()
+        # if the first parameter of `NamedTuple` is known to be the empty tuple,
+        # the second parameter should also be the empty tuple type,
+        # so refine it here
+        return Const(NamedTuple{(),Tuple{}})
     end
     ans = Type{appl}
     for i = length(outervars):-1:1
diff --git a/base/namedtuple.jl b/base/namedtuple.jl
index 3e9f1272d588e4..b53304de7d8cc4 100644
--- a/base/namedtuple.jl
+++ b/base/namedtuple.jl
@@ -335,7 +335,7 @@ reverse(nt::NamedTuple) = NamedTuple{reverse(keys(nt))}(reverse(values(nt)))
 end
 
 """
-    structdiff(a::NamedTuple{an}, b::Union{NamedTuple{bn},Type{NamedTuple{bn}}}) where {an,bn}
+    structdiff(a::NamedTuple, b::Union{NamedTuple,Type{NamedTuple}})
 
 Construct a copy of named tuple `a`, except with fields that exist in `b` removed.
 `b` can be a named tuple, or a type of the form `NamedTuple{field_names}`.
@@ -343,14 +343,19 @@ Construct a copy of named tuple `a`, except with fields that exist in `b` remove
 function structdiff(a::NamedTuple{an}, b::Union{NamedTuple{bn}, Type{NamedTuple{bn}}}) where {an, bn}
     if @generated
         names = diff_names(an, bn)
+        isempty(names) && return (;) # just a fast path
         idx = Int[ fieldindex(a, names[n]) for n in 1:length(names) ]
         types = Tuple{Any[ fieldtype(a, idx[n]) for n in 1:length(idx) ]...}
         vals = Any[ :(getfield(a, $(idx[n]))) for n in 1:length(idx) ]
-        :( NamedTuple{$names,$types}(($(vals...),)) )
+        return :( NamedTuple{$names,$types}(($(vals...),)) )
     else
         names = diff_names(an, bn)
+        # N.B. this early return is necessary for better type stability, and it
+        # also lets us avoid the cost of constructing the potentially
+        # type-unstable closure passed to `map` below
+        isempty(names) && return (;)
         types = Tuple{Any[ fieldtype(typeof(a), names[n]) for n in 1:length(names) ]...}
-        NamedTuple{names,types}(map(Fix1(getfield, a), names))
+        return NamedTuple{names,types}(map(n::Symbol->getfield(a, n), names))
     end
 end
 
diff --git a/test/compiler/inference.jl b/test/compiler/inference.jl
index dcd57916589cf4..d6f26ab2f899aa 100644
--- a/test/compiler/inference.jl
+++ b/test/compiler/inference.jl
@@ -2336,6 +2336,12 @@ end
 # Equivalence of Const(T.instance) and T for singleton types
 @test Const(nothing) ⊑ Nothing && Nothing ⊑ Const(nothing)
 
+# `apply_type_tfunc` should always return an accurate result for the empty NamedTuple case
+import Core: Const
+import Core.Compiler: apply_type_tfunc
+@test apply_type_tfunc(Const(NamedTuple), Const(()), Type{T} where T<:Tuple{}) === Const(typeof((;)))
+@test apply_type_tfunc(Const(NamedTuple), Const(()), Type{T} where T<:Tuple) === Const(typeof((;)))
+
 # Don't pessimize apply_type to anything worse than Type and yield Bottom for invalid Unions
 @test Core.Compiler.return_type(Core.apply_type, Tuple{Type{Union}}) == Type{Union{}}
 @test Core.Compiler.return_type(Core.apply_type, Tuple{Type{Union},Any}) == Type
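For completeness, the changed `structdiff` code paths can also be exercised
directly at the REPL. This is only a sketch, not part of the patch:
`Base.structdiff` and `Core.Compiler.return_type` are internal helpers, and the
alias `TypeUnstableNT` is made up for the example.
```julia
# a NamedTuple type whose names are known but whose field types are not,
# matching the `@nospecialize`-d keyword situation targeted by this commit
const TypeUnstableNT = NamedTuple{(:a, :b), T} where T<:Tuple{Any, Any}

# with the early return, inference can prove the empty-diff result exactly
Core.Compiler.return_type(Base.structdiff,
    Tuple{TypeUnstableNT, Type{NamedTuple{(:a, :b)}}})  # expected: NamedTuple{(), Tuple{}}

# runtime behavior is unchanged: fields whose names occur in `b` are dropped
Base.structdiff((a = 1, b = 2, c = 3), NamedTuple{(:a, :c)}) === (b = 2,)  # true
```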