Skip to content

Commit

Permalink
Add support for 32-bit hashing of strings stored in Unsigned values (#13)
Browse files Browse the repository at this point in the history

* Add support for 32-bit hashing of strings stored in Unsigned values

* Update tests
  • Loading branch information
ScottPJones authored May 6, 2021
1 parent a7beaab commit e1974e5
Show file tree
Hide file tree
Showing 4 changed files with 224 additions and 45 deletions.
43 changes: 43 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
name: CI
on:
- push
- pull_request
jobs:
test:
name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
version:
- '1.5'
- '1.6'
- 'nightly'
os:
- ubuntu-latest
- macOS-latest
- windows-latest
arch:
- x64
- x86
exclude:
- os: macOS-latest
arch: x86
steps:
- uses: actions/checkout@v2
- uses: julia-actions/setup-julia@v1
with:
version: ${{ matrix.version }}
arch: ${{ matrix.arch }}
- uses: actions/cache@v1
env:
cache-name: cache-artifacts
with:
path: ~/.julia/artifacts
key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }}
restore-keys: |
${{ runner.os }}-test-${{ env.cache-name }}-
${{ runner.os }}-test-
${{ runner.os }}-
- uses: julia-actions/julia-buildpkg@v1
- uses: julia-actions/julia-runtest@v1
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ keywords = ["Strings", "Hashing"]
license = "MIT"
name = "MurmurHash3"
uuid = "b10b62ed-fbae-5ea5-b934-abaf0477b71d"
version = "1.1.1"
version = "1.2.0"

[deps]

Expand Down
144 changes: 118 additions & 26 deletions src/MurmurHash3.jl
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,10 @@ const c2 = 0x4cf5ad432745937f
@inline mhtail2(h2, k2) = xor(h2, rotl33(k2 * c2) * c1)

# Mix the key words `k1` and `k2` into the running two-lane state `(h1, h2)`
# and return the updated pair.
@inline function mhblock(h1, h2, k1, k2)
    h1 = (rotl27(mhtail1(h1, k1)) + h2) * 5 + 0x52dce729
    # the second lane is combined with the already-updated first lane
    h2 = (rotl31(mhtail2(h2, k2)) + h1) * 5 + 0x38495ab5
    return h1, h2
end

Expand All @@ -59,7 +59,6 @@ end
end

@inline function mhfin(len, h1, h2)
dbf[] && print("mhfin($len, $(repr(h1)), $(repr(h2))) => ")
h1 = xor(h1, u64(len))
h2 = xor(h2, u64(len))

Expand All @@ -70,7 +69,6 @@ end
h2 = fmix(h2)

h1 += h2
dbf[] && println(repr(h1), ", ", repr(h1 + h2))
h1, h1 + h2
end

Expand Down Expand Up @@ -182,7 +180,6 @@ end

@inline function add_utf8(cnt, chr, k1::UInt128)
ch = u32(chr)
dbf[] && println("add_utf($cnt, $(repr(ch)), $(repr(k1))")
if ch <= 0x7f
cnt + 1, k1 | shift_n(ch, cnt)
elseif ch <= 0x7ff
Expand All @@ -197,7 +194,7 @@ end
@inline function add_utf8_split(cnt, chr, k1::UInt128)
ch = u32(chr)
ch <= 0x7f && return (cnt + 1, k1 | shift_n(ch, cnt), u64(0))
dbf[] && print("add_utf_split($cnt, $(repr(ch)), $(repr(k1))")
# dbf[] && print("add_utf_split($cnt, $(repr(ch)), $(repr(k1))")
if ch <= 0x7ff
nc = cnt + 2
v = get_utf8_2(ch)
Expand All @@ -212,12 +209,12 @@ end
# This will always go over, may be 1, 2, 3 bytes in second word
nc = cnt + 4
v = get_utf8_4(ch)
dbf[] && println(" : cnt=$cnt, v=$(repr(v))")
# dbf[] && println(" : cnt=$cnt, v=$(repr(v))")
v1, v2 = cnt == 13 ? (up13b(v), u64(v) >>> 24) :
cnt == 14 ? (up14b(v), u64(v) >>> 16) :
(up15b(v), u64(v) >>> 8)
end
dbf[] && println(" -> ($nc, $(repr(v1)) => $(repr(k1|v1)), $(repr(v2)))")
# dbf[] && println(" -> ($nc, $(repr(v1)) => $(repr(k1|v1)), $(repr(v2)))")
return (nc, k1 | v1, v2)
end

Expand Down Expand Up @@ -292,9 +289,9 @@ end
function mmhash128_8_u(len::Integer, unaligned_pnt::Ptr, seed::UInt32)
# Should optimize handling of short (< 16 byte) unaligned strings
ulp = reinterpret(UInt, unaligned_pnt)
pnt = reinterpret(Ptr{UInt64}, ulp & ~u64(7))
fin = reinterpret(Ptr{UInt64}, (ulp + len + 0x7) & ~u64(7)) - 8
shft = (ulp & u64(7))<<3
pnt = reinterpret(Ptr{UInt64}, ulp & ~UInt(7))
fin = reinterpret(Ptr{UInt64}, (ulp + len + 0x7) & ~UInt(7)) - 8
shft = (ulp & UInt(7))<<3
h1 = h2 = u64(seed)
k1 = unsafe_load(pnt) # Pick up first 1-7 bytes
k2 = u64(0)
Expand All @@ -313,6 +310,8 @@ end

#----------------------------------------------------------------------------

# 32-bit MurmurHash3 (see MurmurHash3_x86_32)

# Shift-xor steps: fold the upper bits of `k` down onto its lower bits.
@inline function xor16(k::UInt32)
    return k ⊻ (k >>> 16)
end

@inline function xor13(k::UInt32)
    return k ⊻ (k >>> 13)
end

Expand Down Expand Up @@ -340,14 +339,40 @@ const d2 = 0x1b873593
pnt, h1
end

function mmhash32(len, pnt, seed::UInt32)
pnt, h1 = mhbody(len >>> 2, reinterpret(Ptr{UInt32}, pnt), seed)
# Scramble the tail word `v` (multiply, rotate, multiply) and xor it into `h`.
@inline function mhtail32(h, v)
    k = rotl15(v * d1) * d2
    return xor(h, k)
end

# Keep only the low bytes of `v`: one byte when `res == 1`, two bytes when
# `res == 2`, and three bytes for any other `res`.
@inline function mask32(v, res)
    keep = res == 1 ? 0x000000ff : (res == 2 ? 0x0000ffff : 0x00ffffff)
    return v & keep
end

# Finish a 32-bit hash when the trailing bytes are still in memory.
# `pnt` addresses the word that holds the last `len & 3` bytes (if any) and
# `seed` is the hash state accumulated so far.
@inline function calc32(len, pnt::Ptr, seed)
    ntail = len & 3
    if ntail != 0
        # load the whole word, then drop the bytes that lie beyond `len`
        seed = mhtail32(seed, mask32(unsafe_load(pnt), ntail))
    end
    return fmix(xor(seed, u32(len)))
end

# Finish a 32-bit hash when the trailing bytes are held in the value `val`.
# Only the low `len & 3` bytes of `val` are mixed in; `seed` is the hash state
# accumulated so far. The length is folded in before the final `fmix`.
@inline function calc32(len, val::UInt32, seed)
    res = len & 3
    res != 0 && (seed = mhtail32(seed, mask32(val, res)))
    return fmix(xor(seed, u32(len)))
end


# 32-bit hash of `len` bytes starting at `pnt`: `mhbody` consumes the complete
# 4-byte words, and whatever it returns is handed to `calc32` for the tail.
function mmhash32(len, pnt::Ptr, seed::UInt32)
    body = mhbody(len >>> 2, reinterpret(Ptr{UInt32}, pnt), seed)
    return calc32(len, body...)
end

# length must be 0-3
# The whole input fits in one 32-bit value, so there is only a tail to finish.
function mmhash32(len, val::UInt32, seed::UInt32)
    return calc32(len, val, seed)
end

# length must be 0-7
function mmhash32(len, val::UInt64, seed::UInt32)
    # up to 3 bytes: the input is all tail and lives in the low word
    len > 3 || return calc32(len, u32(val), seed)
    # otherwise the low word is one full block and the tail is in the high word
    return calc32(len, u32(val >>> 32), mhblock(seed, u32(val)))
end

# 32-bit hash of the first `len` bytes packed into the unsigned value `val`
# (lowest-order bytes first). Each complete 4-byte group is mixed as a block and
# shifted out; the remaining `len & 3` bytes are finished by `calc32`.
function mmhash32(len, val::Unsigned, seed::UInt32)
    for _ in 1:(len >>> 2)
        seed = mhblock(seed, u32(val))
        val >>>= 32
    end
    return calc32(len, u32(val), seed)
end

@inline function mhfin(len, h1, h2, h3, h4)
Expand Down Expand Up @@ -378,11 +403,18 @@ const e2 = 0xab0e9789
const e3 = 0x38b34ae5
const e4 = 0xa1e38b93

# Per-lane tail mixers for the 4-lane hash: each scrambles `v` with that lane's
# multiplier pair and rotation, then xors the result into `h`.
function mhtail4_1(h, v)
    return xor(h, rotl15(v * e1) * e2)
end

function mhtail4_2(h, v)
    return xor(h, rotl16(v * e2) * e3)
end

function mhtail4_3(h, v)
    return xor(h, rotl17(v * e3) * e4)
end

function mhtail4_4(h, v)
    return xor(h, rotl18(v * e4) * e1)
end

# Keep the low `left & 3` bytes of the 32-bit word taken from `val`.
# When `left & 3 == 0` the mask is zero, so callers use this only for partial words.
@inline function mask_v32(val, left)
    nbits = (left & 3) << 3
    return u32(val) & ((UInt32(1) << nbits) - 0x1)
end

# Mix one block of four key words `k1..k4` into the four-lane state `h1..h4`
# and return the updated lanes. Each key word is mixed into its lane exactly
# once; every lane is then combined with its neighbour (h4 with the new h1).
@inline function mhblock(h1, h2, h3, h4, k1, k2, k3, k4)
    h1 = (rotl19(mhtail4_1(h1, k1)) + h2)*5 + 0x561ccd1b
    h2 = (rotl17(mhtail4_2(h2, k2)) + h3)*5 + 0x0bcaa747
    h3 = (rotl15(mhtail4_3(h3, k3)) + h4)*5 + 0x96cd1c35
    h4 = (rotl13(mhtail4_4(h4, k4)) + h1)*5 + 0x32ac3b17
    return h1, h2, h3, h4
end

Expand All @@ -402,21 +434,81 @@ function mmhash128_4(seed::UInt32)
up32(h) | fmix(4*seed)*4, up32(h) | h
end

function mmhash128_4(len, pnt, seed::UInt32)
# For val that is stored in up to 32 bits, we handle it without any loops
# len can only be 1-3 in this case (0 is forwarded to the seed-only method)
function mmhash128_4(len::Integer, val::UInt32, seed::UInt32)
    if len == 0
        return mmhash128_4(seed)
    end
    # only lane 1 receives data; the other lanes stay at the seed
    h1 = mhtail4_1(seed, mask_v32(val, len))
    return mhfin(len, h1, seed, seed, seed)
end

# For val that is stored in up to 64 bits, we handle it without any loops
# len can only be 1-7 in this case (0 is forwarded to the seed-only method)
function mmhash128_4(len::Integer, val::UInt64, seed::UInt32)
    len == 0 && return mmhash128_4(seed)
    # lane 1: low word, masked only when it is partial
    k1 = len < 4 ? mask_v32(val, len) : u32(val)
    h1 = mhtail4_1(seed, k1)
    # lane 2: partial high word, present only when more than 4 bytes are given
    h2 = seed
    if len > 4
        h2 = mhtail4_2(seed, mask_v32(val >>> 32, len))
    end
    return mhfin(len, h1, h2, seed, seed)
end

# For val that is stored in up to 128 bits, we handle it without any loops
# len can only be 1-15 in this case (0 is forwarded to the seed-only method)
function mmhash128_4(len::Integer, val::UInt128, seed::UInt32)
    len == 0 && return mmhash128_4(seed)
    # Lane 1 must be taken from the lowest word *before* `val` is shifted below;
    # computing it afterwards would hash the wrong word whenever len > 4.
    h1 = mhtail4_1(seed, len < 4 ? mask_v32(val, len) : u32(val))
    h2 = h3 = h4 = seed
    if len > 4
        val >>>= 32
        h2 = mhtail4_2(h2, len < 8 ? mask_v32(val, len) : u32(val))
        if len > 8
            val >>>= 32
            h3 = mhtail4_3(h3, len < 12 ? mask_v32(val, len) : u32(val))
            # the fourth word can only ever be partial (len <= 15)
            len > 12 && (h4 = mhtail4_4(h4, mask_v32(val >>> 32, len)))
        end
    end
    return mhfin(len, h1, h2, h3, h4)
end

# 4-lane 128-bit hash of the first `len` bytes packed into the unsigned value
# `val` (lowest-order bytes first). Every complete 16-byte group is mixed as a
# block and shifted out; the remaining `len & 15` bytes are mixed lane by lane,
# masking whichever word is only partially filled.
function mmhash128_4(len::Integer, val::Unsigned, seed::UInt32)
    h1 = h2 = h3 = h4 = seed
    nblocks = len >>> 4
    for _ in 1:nblocks
        k1, k2, k3, k4 = u32(val), u32(val>>>32), u32(val>>>64), u32(val>>>96)
        h1, h2, h3, h4 = mhblock(h1, h2, h3, h4, k1, k2, k3, k4)
        val >>>= 128
    end
    left = len & 15
    if left != 0
        # lane 1: first remaining word
        h1 = mhtail4_1(h1, left < 4 ? mask_v32(val, left) : u32(val))
        if left > 4
            val >>>= 32
            h2 = mhtail4_2(h2, left < 8 ? mask_v32(val, left) : u32(val))
            if left > 8
                val >>>= 32
                h3 = mhtail4_3(h3, left < 12 ? mask_v32(val, left) : u32(val))
                if left > 12
                    h4 = mhtail4_4(h4, mask_v32(val>>>32, left))
                end
            end
        end
    end
    return mhfin(len, h1, h2, h3, h4)
end

# 4-lane 128-bit hash of `len` bytes starting at `pnt`.
# `mhbody` consumes the complete 16-byte blocks and returns the pointer to the
# remaining bytes together with the four lanes. The remaining `len & 15` bytes
# are then mixed one 32-bit word per lane, each lane exactly once.
# Whole words are loaded from memory, so the last, partially filled word is
# masked with `mask_v32`; otherwise bytes past `len` would leak into the hash.
function mmhash128_4(len::Integer, pnt::Ptr{UInt32}, seed::UInt32)
    tail, h1, h2, h3, h4 = mhbody(len >>> 4, pnt, seed, seed, seed, seed)
    left = len & 15
    if left != 0
        k1 = unsafe_load(tail)
        h1 = mhtail4_1(h1, left < 4 ? mask_v32(k1, left) : k1)
        if left > 4
            k2 = unsafe_load(tail + 4)
            h2 = mhtail4_2(h2, left < 8 ? mask_v32(k2, left) : k2)
            if left > 8
                k3 = unsafe_load(tail + 8)
                h3 = mhtail4_3(h3, left < 12 ? mask_v32(k3, left) : k3)
                # the fourth word can only ever be partial (left <= 15)
                left > 12 && (h4 = mhtail4_4(h4, mask_v32(unsafe_load(tail + 12), left)))
            end
        end
    end
    return mhfin(len, h1, h2, h3, h4)
end

# Accept a pointer of any element type by viewing it as Ptr{UInt32} and
# forwarding to the Ptr{UInt32} method.
function mmhash128_4(len::Integer, pnt::Ptr, seed::UInt32)
    return mmhash128_4(len, reinterpret(Ptr{UInt32}, pnt), seed)
end

# Handle value stored in an unsigned value
import Base.GC: @preserve

# AbstractString MurmurHash3, converts to UTF-8 on the fly (not optimized yet!)
Expand Down
80 changes: 62 additions & 18 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,36 +2,80 @@
# License is MIT: LICENSE.md

using MurmurHash3
using MurmurHash3: mmhash128_4, mmhash128_8_c, mmhash128_8_a, mmhash128_8_u

using Test

p1 = SubString("--hello--",3,7)
p2 = "hello"

_memhash(siz, ptr) = ccall(Base.memhash, UInt, (Ptr{UInt8}, Csize_t, UInt32), ptr, siz, 0%UInt32)
mh(str::String) = _memhash(sizeof(str), pointer(str))
mh(str::AbstractString) = mh(string(str))
const MaybeSub = Union{String,SubString{String}}
const mhseed32 = 0x56419c81
const mhseed64 = 0x71e729fd56419c81

mmhash(str::String) = mmhash128_a(sizeof(str), pointer(str), 0%UInt32)
mmhashc(str::AbstractString) = mmhash128_c(str, 0%UInt32)
# 32-bit MurmurHash (calling C version in Base)
function mh32c(siz::Int, ptr::Ptr{UInt8}, seed=0%UInt32)
    return ccall(:memhash32_seed, UInt32, (Ptr{UInt8}, Csize_t, UInt32), ptr, siz, seed)
end

# Convenience method: hash the bytes of a String or SubString{String}.
function mh32c(str::MaybeSub, seed=0%UInt32)
    return mh32c(sizeof(str), pointer(str), seed)
end

mh32(str) = mmhash32(sizeof(str), pointer(str), 0%UInt32)
# Calling the Julia version in MurmurHash3.jl, on a string's bytes ...
function mh32j(str::MaybeSub, seed=0%UInt32)
    return mmhash32(sizeof(str), pointer(str), seed)
end

# ... or on `len` bytes packed into an unsigned value
function mh32j(len::Integer, val::Unsigned, seed=0%UInt32)
    return mmhash32(len, val, seed)
end

mmhashu(len, val::Unsigned) = mmhash128_a(len, val, 0%UInt32)
# 128-bit MurmurHash (either 32-bit or 64-bit implementation, C version in Base), lower 64-bits
function mh128c(siz::Int, ptr::Ptr{UInt8}, seed=0)
    return ccall(:memhash_seed, UInt64, (Ptr{UInt8}, Csize_t, UInt32), ptr, siz, seed)
end

# Convenience method: hash the bytes of a String or SubString{String};
# the seed `h` is reduced to a UInt32 before the call.
function mh128c(s::MaybeSub, h=0)
    return mh128c(sizeof(s), pointer(s), h%UInt32)
end

# Direct wrappers around the individual 128-bit implementations; the seed `h`
# is reduced to a UInt32 in every case.
function mh128j4(s::MaybeSub, h=0)
    return mmhash128_4(sizeof(s), pointer(s), h%UInt32)
end

function mh128j8a(s::MaybeSub, h=0)
    return mmhash128_8_a(sizeof(s), pointer(s), h%UInt32)
end

function mh128j8u(s::MaybeSub, h=0)
    return mmhash128_8_u(sizeof(s), pointer(s), h%UInt32)
end

function mh128j8c(s::MaybeSub, h=0)
    return mmhash128_8_c(s, h%UInt32)
end

# 128-bit MurmurHash (Julia version), on a string's bytes ...
function mh128j(str::MaybeSub, seed=0)
    return mmhash128_a(sizeof(str), pointer(str), seed%UInt32)
end

# ... on `len` bytes packed into an unsigned value ...
function mh128j(len::Int, val::Unsigned, seed=0)
    return mmhash128_a(len, val, seed%UInt32)
end

# ... and through the AbstractString entry point
function mh128j_c(str::AbstractString, seed=0)
    return mmhash128_c(str, seed%UInt32)
end

# Load the first 8 bytes at `pointer(p)` as one UInt64 (native byte order).
# Always reads 8 bytes, so for inputs shorter than 8 bytes the upper bytes are
# whatever follows in memory; callers pass the real length alongside the value.
# `p` is kept alive for the duration of the raw pointer load.
load_u64(p) = GC.@preserve p unsafe_load(reinterpret(Ptr{UInt64}, pointer(p)))

const sizp2 = sizeof(p2)
const unsp2 = load_u64(p2)

# Print the label `msg` to stderr, followed by a `dump` of `v`.
function pd(msg, v)
    print(stderr, msg)
    return dump(stderr, v)
end

pd("32c: ", mh32c(p2))
pd("32j: ", mh32j(p2))
pd("32u: ", mh32j(sizp2, unsp2))
pd("128c: ", mh128c(p2))
pd("128j: ", mh128j(p2))
pd("128u: ", mh128j(sizp2, unsp2))
pd("128j4: ", mh128j4(p2))
pd("128ja: ", mh128j8a(p2))
pd("128ju: ", mh128j8u(p2))
pd("128jc: ", mh128j8c(p2))

@testset "MurmurHash3" begin
@test mmhashc(p1) == mmhash(p2)
@static if sizeof(Int) == 8
@test last(mmhashu(sizeof(p1), load_u64(p1))) == mh(p1)
@test last(mmhashu(sizeof(p2), load_u64(p2))) == mh(p2)
@test last(mmhashc(p1)) == mh(p1)
@test last(mmhashc(p2)) == mh(p2)
@test last(mmhash(p2)) == mh(p1)
else
@test mh32(p2) == mh(p2)
@testset "32-bit MurmurHash" begin
@test mh32j(p1) == mh32c(p1)
@test mh32j(p2) == mh32c(p2)
@test mh32j(sizp2, unsp2) == mh32c(p2)
end
@static if UInt === UInt64
@testset "Aligned vs unaligned" begin
@test mh128j_c(p1) == mh128j(p2)
end
@testset "128-bit MurmurHash" begin
@test last(mh128j(sizp2, unsp2)) == mh128c(p1)
@static if UInt === UInt64
@test last(mh128j(p1)) == mh128c(p1)
@test last(mh128j_c(p2)) == mh128c(p2)
@test last(mh128j_c(p2)) == mh128c(p1)
else
@test_broken last(mh128j(p1)) == mh128c(p1)
@test_broken last(mh128j_c(p2)) == mh128c(p2)
@test_broken last(mh128j_c(p2)) == mh128c(p1)
end
end
end
end


2 comments on commit e1974e5

@ScottPJones
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register()

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Error while trying to register: Version 1.2.0 already exists

Please sign in to comment.