Skip to content

Commit

Permalink
Swift API for speaker diarization (#1404)
Browse files Browse the repository at this point in the history
  • Loading branch information
csukuangfj authored Oct 9, 2024
1 parent df681e9 commit 1571344
Show file tree
Hide file tree
Showing 4 changed files with 209 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .github/scripts/test-swift.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@ echo "pwd: $PWD"
# Run each Swift API example from inside swift-api-examples and delete what it
# leaves behind before moving on to the next example.
cd swift-api-examples
ls -lh

# Speaker diarization example: the runner script downloads its models and test
# wave on demand, so the *.onnx files, the extracted segmentation model
# directory and the *.wav files are removed afterwards.
# NOTE(review): the compiled ./speaker-diarization binary is not removed here,
# unlike ./add-punctuations below - confirm whether that is intentional.
./run-speaker-diarization.sh
rm -rf *.onnx
rm -rf sherpa-onnx-pyannote-segmentation-3-0
rm -fv *.wav

# Punctuation example: remove the built binary and its model directory.
./run-add-punctuations.sh
rm ./add-punctuations
rm -rf sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12
Expand Down
113 changes: 113 additions & 0 deletions swift-api-examples/SherpaOnnx.swift
Original file line number Diff line number Diff line change
Expand Up @@ -1078,3 +1078,116 @@ class SherpaOnnxOfflinePunctuationWrapper {
return ans
}
}

/// Creates the C configuration struct for a pyannote speaker-segmentation model.
///
/// - Parameter model: Model location; it is converted with `toCPointer` before being
///   stored in the struct's `model` field.
/// - Returns: The populated `SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig`.
func sherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig(model: String)
  -> SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig
{
  SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig(model: toCPointer(model))
}

/// Creates the C configuration struct for the speaker-segmentation stage.
///
/// - Parameters:
///   - pyannote: Pyannote model configuration, stored unchanged.
///   - numThreads: Thread count, narrowed to `Int32` for the C struct. Defaults to 1.
///   - debug: Debug setting, narrowed to `Int32` for the C struct. Defaults to 0.
///     (Presumably non-zero enables debug output - confirm against the C API.)
///   - provider: Provider name, converted with `toCPointer`. Defaults to `"cpu"`.
/// - Returns: The populated `SherpaOnnxOfflineSpeakerSegmentationModelConfig`.
func sherpaOnnxOfflineSpeakerSegmentationModelConfig(
  pyannote: SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig,
  numThreads: Int = 1,
  debug: Int = 0,
  provider: String = "cpu"
) -> SherpaOnnxOfflineSpeakerSegmentationModelConfig {
  // The C struct uses 32-bit integers, so narrow the Swift Ints up front.
  let threadCount = Int32(numThreads)
  let debugFlag = Int32(debug)

  return SherpaOnnxOfflineSpeakerSegmentationModelConfig(
    pyannote: pyannote,
    num_threads: threadCount,
    debug: debugFlag,
    provider: toCPointer(provider)
  )
}

/// Creates the C configuration struct for the clustering stage.
///
/// - Parameters:
///   - numClusters: Cluster count, narrowed to `Int32`. Defaults to -1.
///     (Presumably -1 means "unknown, rely on `threshold`" - confirm against the C API.)
///   - threshold: Clustering threshold, stored unchanged. Defaults to 0.5.
/// - Returns: The populated `SherpaOnnxFastClusteringConfig`.
func sherpaOnnxFastClusteringConfig(numClusters: Int = -1, threshold: Float = 0.5)
  -> SherpaOnnxFastClusteringConfig
{
  let clusterCount = Int32(numClusters)
  return SherpaOnnxFastClusteringConfig(num_clusters: clusterCount, threshold: threshold)
}

/// Creates the C configuration struct for the speaker-embedding extractor.
///
/// - Parameters:
///   - model: Model location, converted with `toCPointer`.
///   - numThreads: Thread count, narrowed to `Int32` for the C struct. Defaults to 1.
///   - debug: Debug setting, narrowed to `Int32` for the C struct. Defaults to 0.
///     (Presumably non-zero enables debug output - confirm against the C API.)
///   - provider: Provider name, converted with `toCPointer`. Defaults to `"cpu"`.
/// - Returns: The populated `SherpaOnnxSpeakerEmbeddingExtractorConfig`.
func sherpaOnnxSpeakerEmbeddingExtractorConfig(
  model: String,
  numThreads: Int = 1,
  debug: Int = 0,
  provider: String = "cpu"
) -> SherpaOnnxSpeakerEmbeddingExtractorConfig {
  SherpaOnnxSpeakerEmbeddingExtractorConfig(
    model: toCPointer(model),
    num_threads: Int32(numThreads),
    debug: Int32(debug),
    provider: toCPointer(provider)
  )
}

/// Bundles the per-stage configurations into the top-level diarization configuration.
///
/// - Parameters:
///   - segmentation: Segmentation model configuration, stored unchanged.
///   - embedding: Embedding extractor configuration, stored unchanged.
///   - clustering: Clustering configuration, stored unchanged.
///   - minDurationOn: Stored in `min_duration_on`. Defaults to 0.3.
///     (Presumably a duration in seconds - confirm against the C API.)
///   - minDurationOff: Stored in `min_duration_off`. Defaults to 0.5.
///     (Presumably a duration in seconds - confirm against the C API.)
/// - Returns: The populated `SherpaOnnxOfflineSpeakerDiarizationConfig`.
func sherpaOnnxOfflineSpeakerDiarizationConfig(
  segmentation: SherpaOnnxOfflineSpeakerSegmentationModelConfig,
  embedding: SherpaOnnxSpeakerEmbeddingExtractorConfig,
  clustering: SherpaOnnxFastClusteringConfig,
  minDurationOn: Float = 0.3,
  minDurationOff: Float = 0.5
) -> SherpaOnnxOfflineSpeakerDiarizationConfig {
  SherpaOnnxOfflineSpeakerDiarizationConfig(
    segmentation: segmentation,
    embedding: embedding,
    clustering: clustering,
    min_duration_on: minDurationOn,
    min_duration_off: minDurationOff
  )
}

/// One diarization segment: a span of audio attributed to a single speaker.
///
/// The property order is part of the synthesized memberwise initializer
/// (`start:end:speaker:`), so keep it stable.
struct SherpaOnnxOfflineSpeakerDiarizationSegmentWrapper {
  /// Where the segment begins (presumably seconds - confirm against the C API).
  var start: Float = 0.0

  /// Where the segment ends (same unit as `start`).
  var end: Float = 0.0

  /// Index of the speaker this segment is attributed to.
  var speaker: Int = 0
}

/// Swift wrapper around the C offline speaker diarization object.
///
/// The C object is created in `init` and destroyed in `deinit`.
class SherpaOnnxOfflineSpeakerDiarizationWrapper {
  /// A pointer to the underlying counterpart in C
  let impl: OpaquePointer!

  /// Creates the underlying C object.
  ///
  /// - Parameter config: Pointer to the diarization configuration; forwarded unchanged to
  ///   `SherpaOnnxCreateOfflineSpeakerDiarization`.
  init(
    config: UnsafePointer<SherpaOnnxOfflineSpeakerDiarizationConfig>!
  ) {
    impl = SherpaOnnxCreateOfflineSpeakerDiarization(config)
  }

  deinit {
    // Only destroy the C object when creation actually produced one.
    if let impl {
      SherpaOnnxDestroyOfflineSpeakerDiarization(impl)
    }
  }

  /// Sample rate reported by the C object for this diarization instance.
  var sampleRate: Int {
    return Int(SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(impl))
  }

  /// Runs diarization over `samples` and returns the segments sorted by start time.
  ///
  /// - Parameter samples: Audio samples handed to the C API together with their count.
  ///   (Presumably mono audio at `sampleRate` - confirm against the C API.)
  /// - Returns: One wrapper per segment, or an empty array when the C API returns no
  ///   result or no sorted segment list.
  func process(samples: [Float]) -> [SherpaOnnxOfflineSpeakerDiarizationSegmentWrapper] {
    guard
      let result = SherpaOnnxOfflineSpeakerDiarizationProcess(
        impl, samples, Int32(samples.count))
    else {
      return []
    }
    // Release the C result on every exit path, including the early return below
    // (previously that path leaked the result).
    defer { SherpaOnnxOfflineSpeakerDiarizationDestroyResult(result) }

    let numSegments = Int(SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(result))

    guard let sorted = SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(result) else {
      return []
    }
    // Registered after the result's defer, so the segment array is freed first,
    // matching the original destroy order.
    defer { SherpaOnnxOfflineSpeakerDiarizationDestroySegment(sorted) }

    // Copy every C segment into a Swift value before the C memory is released.
    return (0..<numSegments).map { i in
      SherpaOnnxOfflineSpeakerDiarizationSegmentWrapper(
        start: sorted[i].start, end: sorted[i].end, speaker: Int(sorted[i].speaker))
    }
  }
}
35 changes: 35 additions & 0 deletions swift-api-examples/run-speaker-diarization.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/usr/bin/env bash
#
# Downloads the models and test wave needed by the speaker diarization example
# (only when they are missing), builds ./speaker-diarization if it does not
# exist yet, and runs it.

# Stop at the first failing command so a broken download or a compile error is
# reported instead of being followed by a confusing failure later on.
set -e

if [ ! -f ./sherpa-onnx-pyannote-segmentation-3-0/model.onnx ]; then
  # -f: fail on HTTP errors instead of saving the server's error page.
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
fi

if [ ! -f ./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ]; then
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
fi

if [ ! -f ./0-four-speakers-zh.wav ]; then
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
fi

if [ ! -e ./speaker-diarization ]; then
  # Note: We use -lc++ to link against libc++ instead of libstdc++
  swiftc \
    -lc++ \
    -I ../build-swift-macos/install/include \
    -import-objc-header ./SherpaOnnx-Bridging-Header.h \
    ./speaker-diarization.swift ./SherpaOnnx.swift \
    -L ../build-swift-macos/install/lib/ \
    -l sherpa-onnx \
    -l onnxruntime \
    -o speaker-diarization

  strip speaker-diarization
else
  echo "./speaker-diarization exists - skip building"
fi

# Let the dynamic loader find the libraries installed by the macOS build.
export DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH
./speaker-diarization
56 changes: 56 additions & 0 deletions swift-api-examples/speaker-diarization.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import AVFoundation

extension AudioBuffer {
  /// Copies the buffer's contents into a Swift array, reading them as `Float` values.
  func array() -> [Float] {
    let samples = UnsafeBufferPointer<Float>(self)
    return [Float](samples)
  }
}

extension AVAudioPCMBuffer {
  /// Returns the samples held in `mBuffers` of this buffer's audio buffer list
  /// as a Swift array.
  func array() -> [Float] {
    let firstBuffer = audioBufferList.pointee.mBuffers
    return firstBuffer.array()
  }
}

/// Runs the speaker diarization example end to end: builds the configuration,
/// loads the test wave file, runs diarization and prints one line per segment.
///
/// Any failure to open or read the audio file terminates the program.
func run() {
  let segmentationModel = "./sherpa-onnx-pyannote-segmentation-3-0/model.onnx"
  let embeddingExtractorModel = "./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx"
  let waveFilename = "./0-four-speakers-zh.wav"

  // There are 4 speakers in ./0-four-speakers-zh.wav, so we use 4 here
  let numSpeakers = 4

  // Build the per-stage configurations first, then bundle them.
  let pyannoteConfig = sherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig(
    model: segmentationModel)
  let segmentationConfig = sherpaOnnxOfflineSpeakerSegmentationModelConfig(
    pyannote: pyannoteConfig)
  let embeddingConfig = sherpaOnnxSpeakerEmbeddingExtractorConfig(model: embeddingExtractorModel)
  let clusteringConfig = sherpaOnnxFastClusteringConfig(numClusters: numSpeakers)

  // `var` because the wrapper takes the configuration by pointer.
  var config = sherpaOnnxOfflineSpeakerDiarizationConfig(
    segmentation: segmentationConfig,
    embedding: embeddingConfig,
    clustering: clusteringConfig
  )

  let sd = SherpaOnnxOfflineSpeakerDiarizationWrapper(config: &config)

  let audioFile = try! AVAudioFile(forReading: URL(fileURLWithPath: waveFilename))

  // The example expects mono Float32 audio at the diarizer's sample rate.
  let audioFormat = audioFile.processingFormat
  assert(Int(audioFormat.sampleRate) == sd.sampleRate)
  assert(audioFormat.channelCount == 1)
  assert(audioFormat.commonFormat == AVAudioCommonFormat.pcmFormatFloat32)

  // Read the whole file into a single buffer.
  let audioFrameCount = UInt32(audioFile.length)
  let audioFileBuffer = AVAudioPCMBuffer(pcmFormat: audioFormat, frameCapacity: audioFrameCount)!
  try! audioFile.read(into: audioFileBuffer)

  let samples = audioFileBuffer.array()
  print("Started!")
  for segment in sd.process(samples: samples) {
    print("\(segment.start) -- \(segment.end) speaker_\(segment.speaker)")
  }
}

/// Program entry point for the speaker diarization example.
@main
struct App {
  /// Delegates to `run()`, which performs the whole example.
  static func main() { run() }
}

0 comments on commit 1571344

Please sign in to comment.