From deb87b9ba9a939ff8a30551e0c9a3f20777ea551 Mon Sep 17 00:00:00 2001
From: Shuhei Kadowaki
Date: Wed, 5 Oct 2022 21:49:35 +0900
Subject: [PATCH] improve performance of `@nospecialize`-d keyword func calls
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit fixes a performance issue when calling keyword funcs whose
argument types are not fully known but are `@nospecialize`-d. The final
result looks like the following (this particular example is taken from our
Julia-level compiler implementation):
```julia
abstract type CallInfo end
struct NoCallInfo <: CallInfo end

struct NewInstruction
    stmt::Any
    type::Any
    info::CallInfo
    line::Union{Int32,Nothing} # if nothing, copy the line from previous statement in the insertion location
    flag::Union{UInt8,Nothing} # if nothing, IR flags will be recomputed on insertion
    function NewInstruction(@nospecialize(stmt), @nospecialize(type), @nospecialize(info::CallInfo),
                            line::Union{Int32,Nothing}, flag::Union{UInt8,Nothing})
        return new(stmt, type, info, line, flag)
    end
end
@nospecialize
function NewInstruction(newinst::NewInstruction;
                        stmt=newinst.stmt,
                        type=newinst.type,
                        info::CallInfo=newinst.info,
                        line::Union{Int32,Nothing}=newinst.line,
                        flag::Union{UInt8,Nothing}=newinst.flag)
    return NewInstruction(stmt, type, info, line, flag)
end
@specialize

using BenchmarkTools

struct VirtualKwargs
    stmt::Any
    type::Any
    info::CallInfo
end

vkws = VirtualKwargs(nothing, Any, NoCallInfo())
newinst = NewInstruction(nothing, Any, NoCallInfo(), nothing, nothing)

runner(newinst, vkws) = NewInstruction(newinst; vkws.stmt, vkws.type, vkws.info)

@benchmark runner($newinst, $vkws)
```

> on master
```
BenchmarkTools.Trial: 10000 samples with 186 evaluations.
 Range (min … max):  559.898 ns …   4.173 μs  ┊ GC (min … max): 0.00% … 85.29%
 Time  (median):     605.608 ns               ┊ GC (median):    0.00%
 Time  (mean ± σ):   638.170 ns ± 125.080 ns  ┊ GC (mean ± σ):  0.06% ±  0.85%

  █▇▂▆▄         ▁█▇▄▂                                           ▂
  ██████▅██████▇▇▇██████▇▇▇▆▆▅▄▅▄▂▄▄▅▇▆▆▆▆▆▅▆▆▄▄▅▅▄▃▄▄▄▅▃▅▅▆▅▆▆ █
  560 ns        Histogram: log(frequency) by time       1.23 μs <

 Memory estimate: 32 bytes, allocs estimate: 2.
```

> on this commit
```julia
BenchmarkTools.Trial: 10000 samples with 1000 evaluations.
 Range (min … max):  3.080 ns … 83.177 ns  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     3.098 ns              ┊ GC (median):    0.00%
 Time  (mean ± σ):   3.118 ns ±  0.885 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

        ▂▅▇█▆▅▄▂
  ▂▄▆▆▇████████▆▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▁▁▂▂▂▁▂▂▂▂▂▂▁▁▂▁▂▂▂▂▂▂▂▂▂ ▃
  3.08 ns        Histogram: frequency by time        3.19 ns <

 Memory estimate: 0 bytes, allocs estimate: 0.
```

So for this particular case the commit achieves roughly a 200x speedup, because
it allows the call to the keyword sorter to be inlined and the `NamedTuple`
construction to be removed.

More specifically, this commit is composed of the following improvements:
- add an early-return case for `structdiff`:
  This change improves return-type inference for the case where the compared
  `NamedTuple`s are type-unstable but their names do not differ, e.g. given two
  `NamedTuple{(:a,:b),T} where T<:Tuple{Any,Any}`s. In such a case the optimizer
  can remove the `structdiff` call and the succeeding `pairs` call, letting the
  keyword sorter be inlined (a small sketch at the end of this message shows
  where these calls come from).
- add special SROA handling for the `NamedTuple` generated by the keyword sorter:
  With the change to `structdiff`, the IR for a call with type-unstable keyword
  arguments would look like this after inlining:
  ```
  %1 = tuple(a, b, c)::Tuple{Any, Any, Any}
  %2 = NamedTuple{(:a, :b, :c)}(%1)::NamedTuple{(:a, :b, :c), _A} where _A<:Tuple{Any, Any, Any}
  %3 = Core.getfield(%2, :a)::Any
  %4 = Core.getfield(%2, :b)::Any
  %5 = Core.getfield(%2, :c)::Any
  [... other body of the keyword func ...]
  ```
  We can implement slightly hacky special handling within our SROA pass that
  checks whether this definition (%2) is a partly-known `NamedTuple` construction,
  i.e. one whose names are fully known, and also whether its call argument (%1)
  is a fully-known `tuple` call. When the length of the `NamedTuple` names matches
  the number of arguments of the `tuple` call, we can safely replace those
  `getfield` calls with the corresponding `tuple` call arguments, letting the
  later DCE pass delete the constructions of the tuple and the named tuple
  altogether.

With these changes, the IR for the example `NewInstruction` constructor is now
fairly well optimized:
```julia
julia> Base.code_ircode((NewInstruction,Any,Any,CallInfo)) do newinst, stmt, type, info
           NewInstruction(newinst; stmt, type, info)
       end |> only
2 1 ── %1  = Base.getfield(_2, :line)::Union{Nothing, Int32}                  │╻╷ Type##kw
  │    %2  = Base.getfield(_2, :flag)::Union{Nothing, UInt8}                  ││┃  getproperty
  │    %3  = (isa)(%1, Nothing)::Bool                                         ││
  │    %4  = (isa)(%2, Nothing)::Bool                                         ││
  │    %5  = (Core.Intrinsics.and_int)(%3, %4)::Bool                          ││
  └───       goto #3 if not %5                                                ││
  2 ── %7  = %new(Main.NewInstruction, _3, _4, _5, nothing, nothing)::NewInstruction  NewInstruction
  └───       goto #10                                                         ││
  3 ── %9  = (isa)(%1, Int32)::Bool                                           ││
  │    %10 = (isa)(%2, Nothing)::Bool                                         ││
  │    %11 = (Core.Intrinsics.and_int)(%9, %10)::Bool                         ││
  └───       goto #5 if not %11                                               ││
  4 ── %13 = π (%1, Int32)                                                    ││
  │    %14 = %new(Main.NewInstruction, _3, _4, _5, %13, nothing)::NewInstruction │││╻ NewInstruction
  └───       goto #10                                                         ││
  5 ── %16 = (isa)(%1, Nothing)::Bool                                         ││
  │    %17 = (isa)(%2, UInt8)::Bool                                           ││
  │    %18 = (Core.Intrinsics.and_int)(%16, %17)::Bool                        ││
  └───       goto #7 if not %18                                               ││
  6 ── %20 = π (%2, UInt8)                                                    ││
  │    %21 = %new(Main.NewInstruction, _3, _4, _5, nothing, %20)::NewInstruction │││╻ NewInstruction
  └───       goto #10                                                         ││
  7 ── %23 = (isa)(%1, Int32)::Bool                                           ││
  │    %24 = (isa)(%2, UInt8)::Bool                                           ││
  │    %25 = (Core.Intrinsics.and_int)(%23, %24)::Bool                        ││
  └───       goto #9 if not %25                                               ││
  8 ── %27 = π (%1, Int32)                                                    ││
  │    %28 = π (%2, UInt8)                                                    ││
  │    %29 = %new(Main.NewInstruction, _3, _4, _5, %27, %28)::NewInstruction  │││╻ NewInstruction
  └───       goto #10                                                         ││
  9 ── Core.throw(ErrorException("fatal error in type inference (type bound)"))::Union{}
  └───       unreachable                                                      ││
  10 ┄ %33 = φ (#2 => %7, #4 => %14, #6 => %21, #8 => %29)::NewInstruction    ││
  └───       goto #11                                                         ││
  11 ─ return %33                                                             │
   => NewInstruction
```
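For context, here is a hypothetical, stripped-down model of the check that a
keyword sorter such as the `Type##kw` method above performs on its keyword
`NamedTuple`; this is where the `structdiff` and `pairs` calls targeted by the
first bullet come from. This is only a sketch, not actual lowering output: the
name `reject_unsupported_kwargs` is made up, and the last call reuses
`NoCallInfo` from the benchmark snippet above.
```julia
# Hypothetical model of the keyword-sorter check; the real code is emitted by lowering.
function reject_unsupported_kwargs(kwargs::NamedTuple, declared::Type{<:NamedTuple})
    # `structdiff` drops every field of `kwargs` whose name is declared by the
    # callee; anything left over is an unsupported keyword argument
    remaining = Base.structdiff(kwargs, declared)
    isempty(pairs(remaining)) || error("unsupported keyword arguments: ", keys(remaining))
    return nothing
end

# mimics `NewInstruction(newinst; stmt, type, info)`: every supplied name is declared,
# so `structdiff` hits the new early return and yields `(;)`
reject_unsupported_kwargs((stmt = nothing, type = Any, info = NoCallInfo()),
                          NamedTuple{(:stmt, :type, :info, :line, :flag)})
```
Because the declared names are a compile-time constant, a precise `structdiff`
result is all inference needs to prove this check dead and inline the keyword
sorter.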
---
 base/compiler/ssair/passes.jl | 39 ++++++++++++++++++++++++++++++++++-
 base/compiler/tfuncs.jl       |  5 +++++
 base/namedtuple.jl            | 11 +++++++---
 test/compiler/inference.jl    |  6 ++++++
 4 files changed, 57 insertions(+), 4 deletions(-)

diff --git a/base/compiler/ssair/passes.jl b/base/compiler/ssair/passes.jl
index 8ad72285fcaeb3..7932f61e8de8aa 100644
--- a/base/compiler/ssair/passes.jl
+++ b/base/compiler/ssair/passes.jl
@@ -415,6 +415,43 @@ function lift_leaves(compact::IncrementalCompact,
                 return nothing
             else
                 typ = argextype(leaf, compact)
+
+                # Here we implement a slightly hacky special case to optimize a function call with
+                # type-unstable but `@nospecialize`-d keyword arguments, whose IR at this point
+                # would look like:
+                # ```
+                # %1 = tuple(a, b, c)::Tuple{Any, Any, Any}
+                # %2 = NamedTuple{(:a, :b, :c)}(%1)::NamedTuple{(:a, :b, :c), _A} where _A<:Tuple{Any, Any, Any}
+                # %3 = Core.getfield(%2, :a)::Any
+                # %4 = Core.getfield(%2, :b)::Any
+                # %5 = Core.getfield(%2, :c)::Any
+                # [... other body of the keyword func ...]
+                # ```
+                # First, we check if this definition (%2) is a partly-known `NamedTuple`
+                # construction, i.e. one whose names are fully known, and also check
+                # if its call argument (%1) is a fully-known `tuple` call.
+                # When the length of the `NamedTuple` names matches the number of
+                # arguments of the `tuple` call, we can safely replace those `getfield`
+                # calls with the corresponding `tuple` call arguments, while letting the
+                # later DCE pass delete the constructions of the tuple and the named tuple.
+                typ′ = unwrap_unionall(widenconst(typ))
+                if isa(typ′, DataType) && typ′.name === _NAMEDTUPLE_NAME && length(typ′.parameters) == 2
+                    ns = typ′.parameters[1]
+                    if is_known_call(def, NamedTuple{ns}, compact) && length(def.args) == 2
+                        arg = def.args[2]
+                        if isa(arg, AnySSAValue)
+                            argexpr = compact[arg][:inst]
+                            if is_known_call(argexpr, tuple, compact) && length(ns) == length(argexpr.args)-1
+                                # ok, we know this NamedTuple construction is nothrow,
+                                # let's mark this NamedTuple as DCE-eligible
+                                compact[leaf::AnySSAValue][:flag] |= IR_FLAG_EFFECT_FREE
+                                lift_arg!(compact, argexpr.args[field+1], cache_key, argexpr, field+1, lifted_leaves)
+                                continue
+                            end
+                        end
+                    end
+                end
+
                 if !isa(typ, Const)
                     # TODO: (disabled since #27126)
                     # If the leaf is an old ssa value, insert a getfield here
@@ -469,7 +506,7 @@ function lift_arg!(
         end
     end
     lifted_leaves[cache_key] = LiftedValue(lifted)
-    nothing
+    return nothing
 end
 
 function walk_to_def(compact::IncrementalCompact, @nospecialize(leaf))
diff --git a/base/compiler/tfuncs.jl b/base/compiler/tfuncs.jl
index 9c206270976b8d..e6750b8c9000c7 100644
--- a/base/compiler/tfuncs.jl
+++ b/base/compiler/tfuncs.jl
@@ -1594,6 +1594,11 @@ function apply_type_tfunc(@nospecialize(headtypetype), @nospecialize args...)
     end
     if istuple
         return Type{<:appl}
+    elseif isa(appl, DataType) && appl.name === _NAMEDTUPLE_NAME && appl.parameters[1] === ()
+        # if the first parameter of `NamedTuple` is known to be the empty tuple,
+        # the second parameter should also be the empty tuple type,
+        # so refine it here
+        return Const(NamedTuple{(),Tuple{}})
     end
     ans = Type{appl}
     for i = length(outervars):-1:1
diff --git a/base/namedtuple.jl b/base/namedtuple.jl
index 3e9f1272d588e4..b53304de7d8cc4 100644
--- a/base/namedtuple.jl
+++ b/base/namedtuple.jl
@@ -335,7 +335,7 @@ reverse(nt::NamedTuple) = NamedTuple{reverse(keys(nt))}(reverse(values(nt)))
 end
 
 """
-    structdiff(a::NamedTuple{an}, b::Union{NamedTuple{bn},Type{NamedTuple{bn}}}) where {an,bn}
+    structdiff(a::NamedTuple, b::Union{NamedTuple,Type{NamedTuple}})
 
 Construct a copy of named tuple `a`, except with fields that exist in `b` removed.
 `b` can be a named tuple, or a type of the form `NamedTuple{field_names}`.
@@ -343,14 +343,19 @@ Construct a copy of named tuple `a`, except with fields that exist in `b` remove
 function structdiff(a::NamedTuple{an}, b::Union{NamedTuple{bn}, Type{NamedTuple{bn}}}) where {an, bn}
     if @generated
         names = diff_names(an, bn)
+        isempty(names) && return (;) # just a fast path
         idx = Int[ fieldindex(a, names[n]) for n in 1:length(names) ]
         types = Tuple{Any[ fieldtype(a, idx[n]) for n in 1:length(idx) ]...}
         vals = Any[ :(getfield(a, $(idx[n]))) for n in 1:length(idx) ]
-        :( NamedTuple{$names,$types}(($(vals...),)) )
+        return :( NamedTuple{$names,$types}(($(vals...),)) )
     else
         names = diff_names(an, bn)
+        # N.B. this early return is necessary for better type stability, and it
+        # also lets us avoid the cost of constructing the potentially
+        # type-unstable closure passed to `map` below
+        isempty(names) && return (;)
         types = Tuple{Any[ fieldtype(typeof(a), names[n]) for n in 1:length(names) ]...}
-        NamedTuple{names,types}(map(Fix1(getfield, a), names))
+        return NamedTuple{names,types}(map(n::Symbol->getfield(a, n), names))
     end
 end
 
diff --git a/test/compiler/inference.jl b/test/compiler/inference.jl
index dcd57916589cf4..d6f26ab2f899aa 100644
--- a/test/compiler/inference.jl
+++ b/test/compiler/inference.jl
@@ -2336,6 +2336,12 @@ end
 # Equivalence of Const(T.instance) and T for singleton types
 @test Const(nothing) ⊑ Nothing && Nothing ⊑ Const(nothing)
 
+# `apply_type_tfunc` should always return an accurate result for the empty NamedTuple case
+import Core: Const
+import Core.Compiler: apply_type_tfunc
+@test apply_type_tfunc(Const(NamedTuple), Const(()), Type{T} where T<:Tuple{}) === Const(typeof((;)))
+@test apply_type_tfunc(Const(NamedTuple), Const(()), Type{T} where T<:Tuple) === Const(typeof((;)))
+
 # Don't pessimize apply_type to anything worse than Type and yield Bottom for invalid Unions
 @test Core.Compiler.return_type(Core.apply_type, Tuple{Type{Union}}) == Type{Union{}}
 @test Core.Compiler.return_type(Core.apply_type, Tuple{Type{Union},Any}) == Type
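For completeness, the changed `structdiff` code paths can also be exercised
directly at the REPL. This is only a sketch, not part of the patch:
`Base.structdiff` and `Core.Compiler.return_type` are internal helpers, and the
alias `TypeUnstableNT` is made up for the example.
```julia
# a NamedTuple type whose names are known but whose field types are not,
# matching the `@nospecialize`-d keyword situation targeted by this commit
const TypeUnstableNT = NamedTuple{(:a, :b), T} where T<:Tuple{Any, Any}

# with the early return, inference can prove the empty-diff result exactly
Core.Compiler.return_type(Base.structdiff,
    Tuple{TypeUnstableNT, Type{NamedTuple{(:a, :b)}}})  # expected: NamedTuple{(), Tuple{}}

# runtime behavior is unchanged: fields whose names occur in `b` are dropped
Base.structdiff((a = 1, b = 2, c = 3), NamedTuple{(:a, :c)}) === (b = 2,)  # true
```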