[WIP] Add Attention is All you need transformer and Translation example #422

Closed
wants to merge 35 commits into from
Commits
0ddb147
added base translation model and package
andr-ec Feb 18, 2020
847480b
WIP for tokenizers
andr-ec Mar 13, 2020
b1a20dd
added full text preprocessing and started creating training loop
andr-ec Mar 13, 2020
4d78760
working attention
andr-ec Mar 17, 2020
7793186
working model on forward pass
andr-ec Mar 17, 2020
7950233
working forwards pass
andr-ec Mar 18, 2020
046b55e
cleaning up code
andr-ec Mar 18, 2020
7488aed
comments
andr-ec Mar 18, 2020
12f3975
working training loop
andr-ec Mar 20, 2020
398a416
updated training step
andr-ec Mar 20, 2020
101a444
to gpu
andr-ec Mar 20, 2020
cac0e88
removed python import
andr-ec Mar 20, 2020
77554c5
added foundation import
andr-ec Mar 20, 2020
312bfda
fixed import in wrong file
andr-ec Mar 20, 2020
e7d38bb
reduced batch size
andr-ec Mar 20, 2020
4345737
added package to allow import of translation models
andr-ec Mar 20, 2020
6e0930d
updated batch and sequence length to defaults
andr-ec Mar 20, 2020
958531e
updated learning rate to that in paper
andr-ec Mar 20, 2020
444f53d
made required methods public, fixed vocab to lookup correct values
andr-ec Mar 20, 2020
d959992
added requirements for greedy decoding
andr-ec Mar 20, 2020
0f9039e
working greedy decoding, working ignoreIndex for padding, training lo…
andr-ec Mar 25, 2020
d6e1a57
moved custom crossentropy to utilities
andr-ec Mar 26, 2020
819e1ec
made softmax public
andr-ec Mar 26, 2020
3c3b674
cleaned up comments and code organization
andr-ec Mar 26, 2020
f7ba238
formatting
andr-ec Mar 26, 2020
a2c6787
added validation loop
andr-ec Mar 26, 2020
9234b64
reformatted to use dataset helpers, much more efficient with memory a…
andr-ec Mar 27, 2020
ee0e3dd
organized project structure and started using existing vocab
andr-ec Mar 27, 2020
0366a69
fix vocabulary loading and imports
andr-ec Mar 27, 2020
456bece
moved extensions, added <unk> token, added decode function
andr-ec Mar 27, 2020
c92bbdd
fixing encoding
andr-ec Mar 27, 2020
1250800
added initialization to many params
andr-ec Mar 27, 2020
0f018bc
fixing initializations
andr-ec Mar 27, 2020
2731c83
added init to activations
andr-ec Mar 27, 2020
9ac1672
removed init from attention
andr-ec Mar 27, 2020
205 changes: 205 additions & 0 deletions Datasets/WMT2014/WMT2014.swift
@@ -0,0 +1,205 @@
//
// WMT2014.swift
//
//
// Created by Andre Carrera on 3/26/20.
//

import TensorFlow
import ModelSupport
import Foundation
#if canImport(FoundationNetworking)
import FoundationNetworking
#endif


let BOS_WORD = "<s>"
let EOS_WORD = "</s>"
let BLANK_WORD = "<blank>"

public struct WMT2014EnDe {
public let directoryURL: URL
public let trainExamples: [Example]
public let devExamples: [Example]
// public let testExamples: [Example]
public let maxSequenceLength: Int
public let batchSize: Int

public typealias ExampleIterator = IndexingIterator<[Example]>
public typealias RepeatExampleIterator = ShuffleIterator<RepeatIterator<ExampleIterator>>
public typealias TrainDataIterator = PrefetchIterator<
GroupedIterator<MapIterator<RepeatExampleIterator, TranslationBatch>>
>
public typealias DevDataIterator = GroupedIterator<MapIterator<ExampleIterator, TranslationBatch>>
// private typealias TestDataIterator = DevDataIterator

public var trainDataIterator: TrainDataIterator
public var devDataIterator: DevDataIterator
// private var testDataIterator: TestDataIterator
}

extension WMT2014EnDe {
public struct Example {
public let id: String
public let sourceSentence: String
public let targetSentence: String

public init(id: String, sourceSentence: String, targetSentence: String) {
self.id = id
self.sourceSentence = sourceSentence
self.targetSentence = targetSentence
}
}


private static let trainGermanURL = URL(string: "https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.de")!
private static let trainEnglishURL = URL(string: "https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.en")!

// internal enum FileType: String {
// case train = "train"
// case dev = "dev"
// case test = "test"
// }

static func load(fromFile fileURL: URL) throws -> [String] {
try Data(contentsOf: fileURL).withUnsafeBytes {
$0.split(separator: UInt8(ascii: "\n"))
.map { String(decoding: UnsafeRawBufferPointer(rebasing: $0), as: UTF8.self) }
}
}
}

extension WMT2014EnDe {
public init(
mapExample: @escaping (Example) -> TranslationBatch,
taskDirectoryURL: URL,
maxSequenceLength: Int,
batchSize: Int) throws {
self.maxSequenceLength = maxSequenceLength
self.batchSize = batchSize

self.directoryURL = taskDirectoryURL.appendingPathComponent("Translation")
let dataURL = directoryURL.appendingPathComponent("data")

let trainGermanDataPath = dataURL.appendingPathExtension("source")
let trainEnglishDataPath = dataURL.appendingPathExtension("target")
print("downloading datasets")
try maybeDownload(from: WMT2014EnDe.trainGermanURL, to: trainGermanDataPath)
try maybeDownload(from: WMT2014EnDe.trainEnglishURL, to: trainEnglishDataPath)
print("loading datasets")
let loadedGerman = try WMT2014EnDe.load(fromFile: trainGermanDataPath)
let loadedEnglish = try WMT2014EnDe.load(fromFile: trainEnglishDataPath)

let examples = WMT2014EnDe.combine(sourceSequences: loadedGerman, targetSequences: loadedEnglish)
(self.trainExamples, self.devExamples) = WMT2014EnDe.split(examples: examples, with: 0.7)
print("creating batches")
self.trainDataIterator = trainExamples.shuffled().makeIterator()
.repeated()
.shuffled(bufferSize: 1000)
.map(mapExample)
.grouped(keyFn: { $0.tokenIds.scalarCount},
sizeFn: { _ in batchSize / maxSequenceLength},
reduceFn: WMT2014EnDe.reduceDataBatches(_:))
.prefetched(count: 2)

self.devDataIterator = devExamples.makeIterator()
.map(mapExample)
.grouped(keyFn: {$0.tokenIds.scalarCount},
sizeFn: { _ in batchSize / maxSequenceLength },
reduceFn: WMT2014EnDe.reduceDataBatches(_:))
}

static func reduceDataBatches(_ batches: [TranslationBatch]) -> TranslationBatch {
return TranslationBatch(tokenIds: Tensor(batches.map{ $0.tokenIds.squeezingShape(at: 0) }), // stack the per-example tensors into one batch
targetTokenIds: Tensor(batches.map{ $0.targetTokenIds.squeezingShape(at: 0) }),
mask: Tensor(batches.map{ $0.mask.squeezingShape(at: 0) }),
targetMask: Tensor(batches.map{ $0.targetMask.squeezingShape(at: 0) }),
targetTruth: Tensor(batches.map{ $0.targetTruth.squeezingShape(at: 0) }),
tokenCount: batches.map { $0.tokenCount }.reduce(0, +))
}

static func combine(sourceSequences: [String], targetSequences: [String]) -> [Example] {
zip(sourceSequences, targetSequences).enumerated().map { (offset: Int, element: Zip2Sequence<[String], [String]>.Element) -> Example in
Example(id: String(offset), sourceSentence: element.0, targetSentence: element.1)
}
}

static func split(examples: [Example], with trainPercent: Double) -> (train: [Example], val: [Example]) {
let splitIndex = Int(Double(examples.count) * trainPercent)
let trainSplit = examples[0..<splitIndex]
let valSplit = examples[splitIndex..<examples.count]
return (Array(trainSplit), Array(valSplit))
}
}


/// Downloads the file at `url` to `path`, if `path` does not exist.
///
/// - Parameters:
/// - from: URL to download data from.
/// - to: Destination file path.
///
/// - Throws: An error if the destination directory cannot be created.
public func maybeDownload(from url: URL, to destination: URL) throws {
if !FileManager.default.fileExists(atPath: destination.path) {
// Create any potentially missing directories.
try FileManager.default.createDirectory(
atPath: destination.deletingLastPathComponent().path,
withIntermediateDirectories: true)

// Create the URL session that will be used to download the dataset.
let semaphore = DispatchSemaphore(value: 0)
let delegate = DataDownloadDelegate(destinationFileUrl: destination, semaphore: semaphore)
let session = URLSession(configuration: .ephemeral, delegate: delegate, delegateQueue: nil)

// Download the data to a temporary file and then copy that file to
// the destination path.
print("Downloading \(url).")
let task = session.downloadTask(with: url)
task.resume()

// Wait for the download to finish.
semaphore.wait()
}
}

internal class DataDownloadDelegate: NSObject, URLSessionDownloadDelegate {
let destinationFileUrl: URL
let semaphore: DispatchSemaphore
let numBytesFrequency: Int64

internal var logCount: Int64 = 0

init(
destinationFileUrl: URL,
semaphore: DispatchSemaphore,
numBytesFrequency: Int64 = 1024 * 1024
) {
self.destinationFileUrl = destinationFileUrl
self.semaphore = semaphore
self.numBytesFrequency = numBytesFrequency
}

internal func urlSession(
_ session: URLSession,
downloadTask: URLSessionDownloadTask,
didFinishDownloadingTo location: URL
) -> Void {
do {
try FileManager.default.moveItem(at: location, to: destinationFileUrl)
} catch (let writeError) {
print("Error writing file \(location.path) : \(writeError)")
}
print("Downloaded successfully to \(location.path).")
semaphore.signal()
}
}

extension Array {
func chunked(into size: Int) -> [[Element]] {
return stride(from: 0, to: count, by: size).map {
Array(self[$0 ..< Swift.min($0 + size, count)])
}
}
}
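
Not part of the diff, but for context: the training pipeline above groups examples by source token count and emits batchSize / maxSequenceLength examples per group, so batchSize behaves as an approximate token budget rather than an example count. Below is a minimal usage sketch under that reading; the TextProcessor-style `processor`, the directory, and the 2000/50 numbers are placeholders (the processor itself is defined in the next file).

import Foundation
import TensorFlow
import Datasets

// Sketch only: `processor.preprocess` maps a WMT2014EnDe.Example to a TranslationBatch.
func makeDataset(processor: TextProcessor, directory: URL) throws -> WMT2014EnDe {
    return try WMT2014EnDe(
        mapExample: processor.preprocess,
        taskDirectoryURL: directory,
        maxSequenceLength: 50,
        batchSize: 2000)
}

// The train iterator repeats and shuffles indefinitely, so bound the loop explicitly.
func printAFewBatchShapes(dataset: inout WMT2014EnDe) {
    for _ in 0..<10 {
        guard let batch = dataset.trainDataIterator.next() else { break }
        print(batch.tokenIds.shape)   // roughly 2000 / 50 = 40 examples per batch
    }
}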
142 changes: 142 additions & 0 deletions Examples/Transformer-Translation/WMT2104Training.swift
@@ -0,0 +1,142 @@
//
// WMT2104Training.swift
//
//
// Created by Andre Carrera on 3/26/20.
//

import Foundation
import TensorFlow
import ModelSupport
import Datasets

public struct TextProcessor {
public let tokenizer: Tokenizer
public var sourceVocabulary: Vocabulary
public var targetVocabulary: Vocabulary
private let sourcePadId: Int32
private let targetPadId: Int32
private let bosId: Int32
private let eosId: Int32
private let targetUnkId: Int32
private let sourceUnkId: Int32
private let maxSequenceLength: Int
private let batchSize: Int
public init(tokenizer: Tokenizer, sourceVocabulary: Vocabulary, targetVocabulary: Vocabulary, maxSequenceLength: Int,
batchSize: Int) {
self.tokenizer = tokenizer
self.sourceVocabulary = sourceVocabulary
self.targetVocabulary = targetVocabulary
self.maxSequenceLength = maxSequenceLength
self.batchSize = batchSize

self.sourcePadId = Int32(self.sourceVocabulary.id(forToken: BLANK_WORD)!)
self.targetPadId = Int32(self.targetVocabulary.id(forToken: BLANK_WORD)!)
self.bosId = Int32(self.targetVocabulary.id(forToken: BOS_WORD)!)
self.eosId = Int32(self.targetVocabulary.id(forToken: EOS_WORD)!)
self.sourceUnkId = Int32(self.sourceVocabulary.id(forToken: UNKNOWN_WORD)!)
self.targetUnkId = Int32(self.targetVocabulary.id(forToken: UNKNOWN_WORD)!)
}
// Takes all source and target sequences and returns batches grouped by sequence length.
// public mutating func preprocess(source: [String], target:[String], maxSequenceLength: Int, batchSize: Int) -> [TextBatch] {
// let sourcePadId = Int32(sourceVocabulary.add(token: BLANK_WORD))
// let targetPadId = Int32(targetVocabulary.add(token: BLANK_WORD))
// let bosId = Int32(targetVocabulary.add(token: BOS_WORD))
// let eosId = Int32(targetVocabulary.add(token: EOS_WORD))
//
// let tokenizedSource = source.map{ src -> [Int32] in
// let tokenizedSequence = tokenizer
// .tokenize(src)
// .prefix(maxSequenceLength)
// return tokenizedSequence.map { Int32(self.sourceVocabulary.add(token: $0))}
// }
// let tokenizedTarget = target.map{ tar -> [Int32] in
// let tokenizedSequence = tokenizer
// .tokenize(tar)
// .prefix(maxSequenceLength)
// return [bosId] + tokenizedSequence.map { Int32(self.targetVocabulary.add(token: $0))} + [eosId]
// }
//
// let sourceWithTarget = zip(tokenizedSource, tokenizedTarget).map{ $0 }
//
// let groupedBySourceSize = Dictionary(grouping: sourceWithTarget, by: { $0.0.count}).values.flatMap { (group: [([Int32], [Int32])]) -> [TextBatch] in
// let batchesFromGroup = group.chunked(into: batchSize)
// return batchesFromGroup.map { (batch: [([Int32], [Int32])]) -> TextBatch in
// // batch has multiple pairs of sources and targets
// let sourceTensor = Tensor(batch.map{ Tensor<Int32>.init($0.0) })
// let maxTargetLength = batch.map{ $0.1.count}.max() ?? 0
// // pad target length up to largest max.
// let targetTensor = Tensor(batch.map{ Tensor<Int32>.init($0.1 + [Int32](repeating: targetPadId, count: (maxTargetLength - $0.1.count))) }) // taraget tensor needs to be padded
// let textBatch = TextBatch(source: sourceTensor, target: targetTensor, sourcePadId: sourcePadId, targetPadId: targetPadId)
// return textBatch
// }
// }
//
// return groupedBySourceSize
// }
/// Only the target sequence is padded to `maxSequenceLength`; source sequences are grouped by length when batching.
public func preprocess(example: WMT2014EnDe.Example) -> TranslationBatch {

let encodedSource = self.tokenizer.tokenize(example.sourceSentence)
.prefix(self.maxSequenceLength)
.map{ Int32(self.sourceVocabulary.id(forToken: $0) ?? Int(self.sourceUnkId))}

var encodedTarget = self.tokenizer.tokenize(example.targetSentence)
.prefix(self.maxSequenceLength - 2)
.map{ Int32(self.targetVocabulary.id(forToken: $0) ?? Int(self.targetUnkId))}
encodedTarget = [bosId] + encodedTarget + [eosId]
let paddingCount = encodedTarget.count < maxSequenceLength ? maxSequenceLength - encodedTarget.count : 0
let padding = [Int32](repeating: targetPadId, count: paddingCount)
encodedTarget = encodedTarget + padding
assert(encodedTarget.count == maxSequenceLength, "encodedTarget.count \(encodedTarget.count) does not equal maxSequenceLength \(maxSequenceLength)")

let sourceTensor = Tensor<Int32>.init(encodedSource).expandingShape(at: 0)

// The target was padded above (to maxSequenceLength) because batches are later grouped only by source length.
let targetTensor = Tensor<Int32>.init( encodedTarget).expandingShape(at: 0)
let singleBatch = TranslationBatch(source: sourceTensor, target: targetTensor, sourcePadId: sourcePadId, targetPadId: targetPadId)

// print("original source:", example.sourceSentence)
// print("decoded source:", decode(tensor: singleBatch.tokenIds, vocab: sourceVocabulary))
//
// print("max len = \(maxSequenceLength)")
// print("encoded target \(encodedTarget.count) last: \(encodedTarget.last!)")
// print("original target:", example.targetSentence)
// print("decoded target:", decode(tensor: singleBatch.targetTokenIds, vocab: targetVocabulary))
// print("decoded truth:", decode(tensor: singleBatch.targetTruth, vocab: targetVocabulary))
return singleBatch
}

}

func decode(tensor: Tensor<Int32>, vocab: Vocabulary) -> String {
let endId = Int32(vocab.id(forToken: "</s>")!)
var words = [String]()
for scalar in tensor.scalars {
if Int(scalar) == endId {
break
} else if let token = vocab.token(forId: Int(scalar)) {
words.append(token)
}
}
return words.joined(separator: " ")
}

extension Vocabulary {

public init(fromFile fileURL: URL, specialTokens: [String]) throws {
let vocabItems = try String(contentsOfFile: fileURL.path, encoding: .utf8)
.components(separatedBy: .newlines)
.map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }
let dictionary = [String: Int](
(specialTokens + vocabItems)
.filter { $0.count > 0 }
.enumerated().map { ($0.element, $0.offset) },
uniquingKeysWith: { (v1, v2) in max(v1, v2) })
self.init(tokensToIds: dictionary)
}
}
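
A minimal end-to-end sketch of wiring the pieces above together (not from this PR): load the two vocabularies with the `Vocabulary(fromFile:specialTokens:)` extension, then build a TextProcessor. The vocabulary file names, the directory, and the batch/sequence sizes are hypothetical placeholders; the special-token strings mirror the constants the PR uses ("<s>", "</s>", "<blank>", "<unk>").

import Foundation
import ModelSupport
import Datasets

// Sketch only: paths are placeholders; `tokenizer` is any ModelSupport Tokenizer conformance.
func makeProcessor(tokenizer: Tokenizer, vocabDirectory: URL) throws -> TextProcessor {
    let specialTokens = ["<s>", "</s>", "<blank>", "<unk>"]
    let sourceVocabulary = try Vocabulary(
        fromFile: vocabDirectory.appendingPathComponent("vocab.de"),
        specialTokens: specialTokens)
    let targetVocabulary = try Vocabulary(
        fromFile: vocabDirectory.appendingPathComponent("vocab.en"),
        specialTokens: specialTokens)
    return TextProcessor(
        tokenizer: tokenizer,
        sourceVocabulary: sourceVocabulary,
        targetVocabulary: targetVocabulary,
        maxSequenceLength: 50,
        batchSize: 2000)
}

// Each example then becomes a single-element TranslationBatch whose target side is
// wrapped in <s> ... </s> and padded to `maxSequenceLength`, e.g.:
// let batch = processor.preprocess(
//     example: WMT2014EnDe.Example(id: "0",
//                                  sourceSentence: "Guten Morgen .",
//                                  targetSentence: "Good morning ."))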