Skip to content

Commit

Permalink
ff/gl64_t.cuh: refactor reduce().
Browse files: browse the repository at this point in the history.
Better readability and marginal improvement in exponentiation.
  • Loading branch information
dot-asm committed Jan 2, 2024
1 parent c536f09 commit e6089ae
Showing 1 changed file with 15 additions and 32 deletions.
47 changes: 15 additions & 32 deletions ff/gl64_t.cuh
Original file line number Diff line number Diff line change
@@ -267,45 +267,28 @@ private:

// Combines the four 32-bit limbs in temp[] using carry-chained inline PTX and
// stores the resulting temp[0]/temp[1] pair, packed as one 64-bit value, in
// val (see the final mov.b64). Presumably this is the modular reduction step
// of the gl64 field type -- confirm against the rest of ff/gl64_t.cuh.
//
// NOTE(review): this listing is a diff hunk (-267,45 +267,28) rendered without
// its +/- markers, so removed and added lines are interleaved below. The body
// is therefore not compilable as shown (unterminated asm statements,
// unbalanced # else / # endif); consult the repository at commit e6089ae for
// the actual post-change text before editing.
inline void reduce(uint32_t temp[4])
{
uint32_t carry;
// Code path selected when compiling for compute capability 7.0 or newer.
# if __CUDA_ARCH__ >= 700
// {temp[1]:temp[0]} -= {temp[3]:temp[2]} with borrow propagation; the trailing
// subc computes 0 - 0 - borrow, so carry ends up 0 or 0xffffffff.
asm("sub.cc.u32 %0, %0, %3; subc.cc.u32 %1, %1, %4; subc.u32 %2, 0, 0;"
: "+r"(temp[0]), "+r"(temp[1]), "=r"(carry)
: "r"(temp[2]), "r"(temp[3]));
// temp[1] += temp[2]; carry += temp[3] + carry-out of the first addition.
asm("add.cc.u32 %0, %0, %2; addc.u32 %1, %1, %3;"
: "+r"(temp[1]), "+r"(carry)
: "r"(temp[2]), "r"(temp[3]));

// The mad.lo/madc.hi pairs below add the 64-bit product of a 32-bit operand
// and gl64_device::W into {temp[1]:temp[0]}; where present, the trailing
// addc.u32 captures the carry-out of that addition.
asm("mad.lo.cc.u32 %0, %3, %4, %0; madc.hi.cc.u32 %1, %3, %4, %1; addc.u32 %2, 0, 0;"
: "+r"(temp[0]), "+r"(temp[1]), "=r"(temp[2])
: "r"(carry), "r"(gl64_device::W));
asm("mad.lo.cc.u32 %0, %2, %3, %0; madc.hi.u32 %1, %2, %3, %1;"
: "+r"(temp[0]), "+r"(temp[1])
: "r"(temp[2]), "r"(gl64_device::W));
asm("mad.lo.cc.u32 %0, %2, %3, %0; madc.hi.cc.u32 %1, %2, %3, %1; addc.u32 %2, 0, 0;"
: "+r"(temp[0]), "+r"(temp[1]), "+r"(temp[2])
: "r"(gl64_device::W));
# else
uint32_t b0, b1;
// NOTE(review): the asm( opened on the next line is never closed before the
// following asm( begins -- diff-interleaving residue, not real code.
asm("add.cc.u32 %0, %2, %3; addc.u32 %1, 0, 0;"

asm("sub.cc.u32 %0, 0, %2; subc.u32 %1, %2, 0;"
: "=r"(b0), "=r"(b1)
: "r"(temp[2]), "r"(temp[3]));
asm("sub.cc.u32 %0, %0, %3; subc.cc.u32 %1, %1, %4; subc.u32 %2, 0, 0;"
: "+r"(temp[0]), "+r"(temp[1]), "=r"(carry)
: "r"(temp[2]));
asm("add.cc.u32 %0, %0, %3; addc.cc.u32 %1, %1, %4; addc.u32 %2, 0, 0;"
: "+r"(temp[0]), "+r"(temp[1]), "=r"(temp[2])
: "r"(b0), "r"(b1));
asm("add.cc.u32 %0, %0, %2; addc.u32 %1, %1, %3;"
: "+r"(temp[0]), "+r"(temp[1])
: "r"(-carry), "r"(carry));
asm("add.cc.u32 %0, %0, %1; addc.u32 %1, 0, 0;"
: "+r"(temp[1]), "+r"(temp[2]));
# endif
asm("sub.cc.u32 %0, %0, %3; subc.cc.u32 %1, %1, 0; subc.u32 %2, %2, 0;"
: "+r"(temp[0]), "+r"(temp[1]), "+r"(temp[2])
: "r"(temp[3]));

// NOTE(review): from here to the mov.b64 the listing again mixes two versions
// of the final correction step (two asm( openers back to back, extra # endif).
# if __CUDA_ARCH__ >= 700
asm("mad.lo.cc.u32 %0, %2, %3, %0; madc.hi.u32 %1, %2, %3, %1;"
asm("sub.cc.u32 %0, %0, %2; subc.u32 %1, %1, %3;"
: "+r"(temp[0]), "+r"(temp[1])
: "r"(temp[2]), "r"(gl64_device::W));
# else
asm("add.cc.u32 %0, %0, %2; addc.u32 %1, %1, 0;"
: "+r"(temp[0]), "+r"(temp[1])
: "r"(-temp[2]));
# endif
# endif
: "r"(temp[2]), "r"(-(int)temp[2]>>1));

// Pack temp[0] (low 32 bits) and temp[1] (high 32 bits) into the 64-bit val.
asm("mov.b64 %0, {%1, %2};" : "=l"(val) : "r"(temp[0]), "r"(temp[1]));
}

Expand Down

0 comments on commit e6089ae

Please sign in to comment.