[WIP] Add Attention is All you need transformer and Translation example #422

Closed
wants to merge 35 commits into from
Commits
0ddb147
added base translation model and package
andr-ec Feb 18, 2020
847480b
WIP for tokenizers
andr-ec Mar 13, 2020
b1a20dd
added full text preprocessing and started creating training loop
andr-ec Mar 13, 2020
4d78760
working attention
andr-ec Mar 17, 2020
7793186
working model on forward pass
andr-ec Mar 17, 2020
7950233
working forwards pass
andr-ec Mar 18, 2020
046b55e
cleaning up code
andr-ec Mar 18, 2020
7488aed
comments
andr-ec Mar 18, 2020
12f3975
working training loop
andr-ec Mar 20, 2020
398a416
updated training step
andr-ec Mar 20, 2020
101a444
to gpu
andr-ec Mar 20, 2020
cac0e88
removed python import
andr-ec Mar 20, 2020
77554c5
added foundation import
andr-ec Mar 20, 2020
312bfda
fixed import in wrong file
andr-ec Mar 20, 2020
e7d38bb
reduced batch size
andr-ec Mar 20, 2020
4345737
added package to allow import of translation models
andr-ec Mar 20, 2020
6e0930d
updated batch and sequence length to defaults
andr-ec Mar 20, 2020
958531e
updated learning rate to that in paper
andr-ec Mar 20, 2020
444f53d
made required methods public, fixed vocab to lookup correct values
andr-ec Mar 20, 2020
d959992
added requirements for greedy decoding
andr-ec Mar 20, 2020
0f9039e
working greedy decoding, working ignoreIndex for padding, training lo…
andr-ec Mar 25, 2020
d6e1a57
moved custom crossentropy to utilities
andr-ec Mar 26, 2020
819e1ec
made softmax public
andr-ec Mar 26, 2020
3c3b674
cleaned up comments and code organization
andr-ec Mar 26, 2020
f7ba238
formatting
andr-ec Mar 26, 2020
a2c6787
added validation loop
andr-ec Mar 26, 2020
9234b64
reformatted to use dataset helpers, much more efficient with memory a…
andr-ec Mar 27, 2020
ee0e3dd
organized project structure and started using existing vocab
andr-ec Mar 27, 2020
0366a69
fix vocabulary loading and imports
andr-ec Mar 27, 2020
456bece
moved extensions, added <unk> token, added decode function
andr-ec Mar 27, 2020
c92bbdd
fixing encoding
andr-ec Mar 27, 2020
1250800
added initialization to many params
andr-ec Mar 27, 2020
0f018bc
fixing initializations
andr-ec Mar 27, 2020
2731c83
added init to activations
andr-ec Mar 27, 2020
9ac1672
removed init from attention
andr-ec Mar 27, 2020
205 changes: 205 additions & 0 deletions Datasets/WMT2014/WMT2014.swift
@@ -0,0 +1,205 @@
//
// WMT2014.swift
//
//
// Created by Andre Carrera on 3/26/20.
//

import TensorFlow
import ModelSupport
import Foundation
#if canImport(FoundationNetworking)
import FoundationNetworking
#endif


let BOS_WORD = "<s>"
let EOS_WORD = "</s>"
let BLANK_WORD = "<blank>"

public struct WMT2014EnDe {
public let directoryURL: URL
public let trainExamples: [Example]
public let devExamples: [Example]
// public let testExamples: [Example]
public let maxSequenceLength: Int
public let batchSize: Int

public typealias ExampleIterator = IndexingIterator<[Example]>
public typealias RepeatExampleIterator = ShuffleIterator<RepeatIterator<ExampleIterator>>
public typealias TrainDataIterator = PrefetchIterator<
GroupedIterator<MapIterator<RepeatExampleIterator, TranslationBatch>>
>
public typealias DevDataIterator = GroupedIterator<MapIterator<ExampleIterator, TranslationBatch>>
// private typealias TestDataIterator = DevDataIterator

public var trainDataIterator: TrainDataIterator
public var devDataIterator: DevDataIterator
// private var testDataIterator: TestDataIterator
}

extension WMT2014EnDe {
public struct Example {
public let id: String
public let sourceSentence: String
public let targetSentence: String

public init(id: String, sourceSentence: String, targetSentence: String) {
self.id = id
self.sourceSentence = sourceSentence
self.targetSentence = targetSentence
}
}


private static let trainGermanURL = URL(string: "https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.de")!
private static let trainEnglishURL = URL(string: "https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.en")!

// internal enum FileType: String {
// case train = "train"
// case dev = "dev"
// case test = "test"
// }

static func load(fromFile fileURL: URL) throws -> [String] {
try Data(contentsOf: fileURL).withUnsafeBytes {
$0.split(separator: UInt8(ascii: "\n"))
.map { String(decoding: UnsafeRawBufferPointer(rebasing: $0), as: UTF8.self) }
}
}
}

extension WMT2014EnDe {
public init(
mapExample: @escaping (Example) -> TranslationBatch,
taskDirectoryURL: URL,
maxSequenceLength: Int,
batchSize: Int) throws {
self.maxSequenceLength = maxSequenceLength
self.batchSize = batchSize

self.directoryURL = taskDirectoryURL.appendingPathComponent("Translation")
let dataURL = directoryURL.appendingPathComponent("data")

let trainGermanDataPath = dataURL.appendingPathExtension("source")
let trainEnglishDataPath = dataURL.appendingPathExtension("target")
print("downloading datasets")
try maybeDownload(from: WMT2014EnDe.trainGermanURL, to: trainGermanDataPath)
try maybeDownload(from: WMT2014EnDe.trainEnglishURL, to: trainEnglishDataPath)
print("loading datasets")
let loadedGerman = try WMT2014EnDe.load(fromFile: trainGermanDataPath)
let loadedEnglish = try WMT2014EnDe.load(fromFile: trainEnglishDataPath)

let examples = WMT2014EnDe.combine(sourceSequences: loadedGerman, targetSequences: loadedEnglish)
(self.trainExamples, self.devExamples) = WMT2014EnDe.split(examples: examples, with: 0.7)
print("creating batches")
self.trainDataIterator = trainExamples.shuffled().makeIterator()
.repeated()
.shuffled(bufferSize: 1000)
.map(mapExample)
.grouped(keyFn: { $0.tokenIds.scalarCount},
sizeFn: { _ in batchSize / maxSequenceLength},
reduceFn: WMT2014EnDe.reduceDataBatches(_:))
.prefetched(count: 2)

self.devDataIterator = devExamples.makeIterator()
.map(mapExample)
.grouped(keyFn: {$0.tokenIds.scalarCount},
sizeFn: { _ in batchSize / maxSequenceLength },
reduceFn: WMT2014EnDe.reduceDataBatches(_:))
}

static func reduceDataBatches(_ batches: [TranslationBatch]) -> TranslationBatch {
return TranslationBatch(tokenIds: Tensor(batches.map{ $0.tokenIds.squeezingShape(at: 0) }), // stack the per-example tensors into one batch
targetTokenIds: Tensor(batches.map{ $0.targetTokenIds.squeezingShape(at: 0) }),
mask: Tensor(batches.map{ $0.mask.squeezingShape(at: 0) }),
targetMask: Tensor(batches.map{ $0.targetMask.squeezingShape(at: 0) }),
targetTruth: Tensor(batches.map{ $0.targetTruth.squeezingShape(at: 0) }),
tokenCount: batches.map { $0.tokenCount }.reduce(0, +))
}

static func combine(sourceSequences: [String], targetSequences: [String]) -> [Example] {
zip(sourceSequences, targetSequences).enumerated().map { (offset: Int, element: Zip2Sequence<[String], [String]>.Element) -> Example in
Example(id: String(offset), sourceSentence: element.0, targetSentence: element.1)
}
}

static func split(examples: [Example], with trainPercent: Double) -> (train: [Example], val: [Example]) {
let splitIndex = Int(Double(examples.count) * trainPercent)
let trainSplit = examples[0..<splitIndex]
let valSplit = examples[splitIndex..<examples.count]
return (Array(trainSplit), Array(valSplit))
}
}


/// Downloads the file at `url` to `path`, if `path` does not exist.
///
/// - Parameters:
/// - from: URL to download data from.
/// - to: Destination file path.
///
/// - Throws: An error if the destination directory cannot be created.
public func maybeDownload(from url: URL, to destination: URL) throws {
if !FileManager.default.fileExists(atPath: destination.path) {
// Create any potentially missing directories.
try FileManager.default.createDirectory(
atPath: destination.deletingLastPathComponent().path,
withIntermediateDirectories: true)

// Create the URL session that will be used to download the dataset.
let semaphore = DispatchSemaphore(value: 0)
let delegate = DataDownloadDelegate(destinationFileUrl: destination, semaphore: semaphore)
let session = URLSession(configuration: .ephemeral, delegate: delegate, delegateQueue: nil)

// Download the data to a temporary file and then copy that file to
// the destination path.
print("Downloading \(url).")
let task = session.downloadTask(with: url)
task.resume()

// Wait for the download to finish.
semaphore.wait()
}
}

internal class DataDownloadDelegate: NSObject, URLSessionDownloadDelegate {
let destinationFileUrl: URL
let semaphore: DispatchSemaphore
let numBytesFrequency: Int64

internal var logCount: Int64 = 0

init(
destinationFileUrl: URL,
semaphore: DispatchSemaphore,
numBytesFrequency: Int64 = 1024 * 1024
) {
self.destinationFileUrl = destinationFileUrl
self.semaphore = semaphore
self.numBytesFrequency = numBytesFrequency
}

internal func urlSession(
_ session: URLSession,
downloadTask: URLSessionDownloadTask,
didFinishDownloadingTo location: URL
) -> Void {
do {
try FileManager.default.moveItem(at: location, to: destinationFileUrl)
} catch (let writeError) {
print("Error writing file \(location.path) : \(writeError)")
}
print("Downloaded successfully to \(location.path).")
semaphore.signal()
}
}

extension Array {
func chunked(into size: Int) -> [[Element]] {
return stride(from: 0, to: count, by: size).map {
Array(self[$0 ..< Swift.min($0 + size, count)])
}
}
}
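
Not part of the diff, but for context: the training pipeline above groups examples by source token count and emits batchSize / maxSequenceLength examples per group, so batchSize behaves as an approximate token budget rather than an example count. Below is a minimal usage sketch under that reading; the TextProcessor-style `processor`, the directory, and the 2000/50 numbers are placeholders (the processor itself is defined in the next file).

import Foundation
import TensorFlow
import Datasets

// Sketch only: `processor.preprocess` maps a WMT2014EnDe.Example to a TranslationBatch.
func makeDataset(processor: TextProcessor, directory: URL) throws -> WMT2014EnDe {
    return try WMT2014EnDe(
        mapExample: processor.preprocess,
        taskDirectoryURL: directory,
        maxSequenceLength: 50,
        batchSize: 2000)
}

// The train iterator repeats and shuffles indefinitely, so bound the loop explicitly.
func printAFewBatchShapes(dataset: inout WMT2014EnDe) {
    for _ in 0..<10 {
        guard let batch = dataset.trainDataIterator.next() else { break }
        print(batch.tokenIds.shape)   // roughly 2000 / 50 = 40 examples per batch
    }
}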
142 changes: 142 additions & 0 deletions Examples/Transformer-Translation/WMT2104Training.swift
@@ -0,0 +1,142 @@
//
// WMT2104Training.swift
//
//
// Created by Andre Carrera on 3/26/20.
//

import Foundation
import TensorFlow
import ModelSupport
import Datasets

public struct TextProcessor {
public let tokenizer: Tokenizer
public var sourceVocabulary: Vocabulary
public var targetVocabulary: Vocabulary
private let sourcePadId: Int32
private let targetPadId: Int32
private let bosId: Int32
private let eosId: Int32
private let targetUnkId: Int32
private let sourceUnkId: Int32
private let maxSequenceLength: Int
private let batchSize: Int
public init(tokenizer: Tokenizer, sourceVocabulary: Vocabulary, targetVocabulary: Vocabulary, maxSequenceLength: Int,
batchSize: Int) {
self.tokenizer = tokenizer
self.sourceVocabulary = sourceVocabulary
self.targetVocabulary = targetVocabulary
self.maxSequenceLength = maxSequenceLength
self.batchSize = batchSize

self.sourcePadId = Int32(self.sourceVocabulary.id(forToken: BLANK_WORD)!)
self.targetPadId = Int32(self.targetVocabulary.id(forToken: BLANK_WORD)!)
self.bosId = Int32(self.targetVocabulary.id(forToken: BOS_WORD)!)
self.eosId = Int32(self.targetVocabulary.id(forToken: EOS_WORD)!)
self.sourceUnkId = Int32(self.sourceVocabulary.id(forToken: UNKNOWN_WORD)!)
self.targetUnkId = Int32(self.targetVocabulary.id(forToken: UNKNOWN_WORD)!)
}
// Takes all source and target sequences and returns batches grouped by sequence length.
// public mutating func preprocess(source: [String], target:[String], maxSequenceLength: Int, batchSize: Int) -> [TextBatch] {
// let sourcePadId = Int32(sourceVocabulary.add(token: BLANK_WORD))
// let targetPadId = Int32(targetVocabulary.add(token: BLANK_WORD))
// let bosId = Int32(targetVocabulary.add(token: BOS_WORD))
// let eosId = Int32(targetVocabulary.add(token: EOS_WORD))
//
// let tokenizedSource = source.map{ src -> [Int32] in
// let tokenizedSequence = tokenizer
// .tokenize(src)
// .prefix(maxSequenceLength)
// return tokenizedSequence.map { Int32(self.sourceVocabulary.add(token: $0))}
// }
// let tokenizedTarget = target.map{ tar -> [Int32] in
// let tokenizedSequence = tokenizer
// .tokenize(tar)
// .prefix(maxSequenceLength)
// return [bosId] + tokenizedSequence.map { Int32(self.targetVocabulary.add(token: $0))} + [eosId]
// }
//
// let sourceWithTarget = zip(tokenizedSource, tokenizedTarget).map{ $0 }
//
// let groupedBySourceSize = Dictionary(grouping: sourceWithTarget, by: { $0.0.count}).values.flatMap { (group: [([Int32], [Int32])]) -> [TextBatch] in
// let batchesFromGroup = group.chunked(into: batchSize)
// return batchesFromGroup.map { (batch: [([Int32], [Int32])]) -> TextBatch in
// // batch has multiple pairs of sources and targets
// let sourceTensor = Tensor(batch.map{ Tensor<Int32>.init($0.0) })
// let maxTargetLength = batch.map{ $0.1.count}.max() ?? 0
// // pad target length up to largest max.
// let targetTensor = Tensor(batch.map{ Tensor<Int32>.init($0.1 + [Int32](repeating: targetPadId, count: (maxTargetLength - $0.1.count))) }) // taraget tensor needs to be padded
// let textBatch = TextBatch(source: sourceTensor, target: targetTensor, sourcePadId: sourcePadId, targetPadId: targetPadId)
// return textBatch
// }
// }
//
// return groupedBySourceSize
// }
/// Only the target sequence is padded to `maxSequenceLength`; source sequences are grouped by length when batching.
public func preprocess(example: WMT2014EnDe.Example) -> TranslationBatch {

let encodedSource = self.tokenizer.tokenize(example.sourceSentence)
.prefix(self.maxSequenceLength)
.map{ Int32(self.sourceVocabulary.id(forToken: $0) ?? Int(self.sourceUnkId))}

var encodedTarget = self.tokenizer.tokenize(example.targetSentence)
.prefix(self.maxSequenceLength - 2)
.map{ Int32(self.targetVocabulary.id(forToken: $0) ?? Int(self.targetUnkId))}
encodedTarget = [bosId] + encodedTarget + [eosId]
let paddingCount = encodedTarget.count < maxSequenceLength ? maxSequenceLength - encodedTarget.count : 0
let padding = [Int32](repeating: targetPadId, count: paddingCount)
encodedTarget = encodedTarget + padding
assert(encodedTarget.count == maxSequenceLength, "encodedTarget.count \(encodedTarget.count) does not equal maxSequenceLength \(maxSequenceLength)")

let sourceTensor = Tensor<Int32>.init(encodedSource).expandingShape(at: 0)

// The target was padded above (to maxSequenceLength) because batches are later grouped only by source length.
let targetTensor = Tensor<Int32>.init( encodedTarget).expandingShape(at: 0)
let singleBatch = TranslationBatch(source: sourceTensor, target: targetTensor, sourcePadId: sourcePadId, targetPadId: targetPadId)

// print("original source:", example.sourceSentence)
// print("decoded source:", decode(tensor: singleBatch.tokenIds, vocab: sourceVocabulary))
//
// print("max len = \(maxSequenceLength)")
// print("encoded target \(encodedTarget.count) last: \(encodedTarget.last!)")
// print("original target:", example.targetSentence)
// print("decoded target:", decode(tensor: singleBatch.targetTokenIds, vocab: targetVocabulary))
// print("decoded truth:", decode(tensor: singleBatch.targetTruth, vocab: targetVocabulary))
return singleBatch
}

}

func decode(tensor: Tensor<Int32>, vocab: Vocabulary) -> String {
let endId = Int32(vocab.id(forToken: "</s>")!)
var words = [String]()
for scalar in tensor.scalars {
if Int(scalar) == endId {
break
} else if let token = vocab.token(forId: Int(scalar)) {
words.append(token)
}
}
return words.joined(separator: " ")
}

extension Vocabulary {

public init(fromFile fileURL: URL, specialTokens: [String]) throws {
let vocabItems = try String(contentsOfFile: fileURL.path, encoding: .utf8)
.components(separatedBy: .newlines)
.map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }
let dictionary = [String: Int](
(specialTokens + vocabItems)
.filter { $0.count > 0 }
.enumerated().map { ($0.element, $0.offset) },
uniquingKeysWith: { (v1, v2) in max(v1, v2) })
self.init(tokensToIds: dictionary)
}
}
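
A minimal end-to-end sketch of wiring the pieces above together (not from this PR): load the two vocabularies with the `Vocabulary(fromFile:specialTokens:)` extension, then build a TextProcessor. The vocabulary file names, the directory, and the batch/sequence sizes are hypothetical placeholders; the special-token strings mirror the constants the PR uses ("<s>", "</s>", "<blank>", "<unk>").

import Foundation
import ModelSupport
import Datasets

// Sketch only: paths are placeholders; `tokenizer` is any ModelSupport Tokenizer conformance.
func makeProcessor(tokenizer: Tokenizer, vocabDirectory: URL) throws -> TextProcessor {
    let specialTokens = ["<s>", "</s>", "<blank>", "<unk>"]
    let sourceVocabulary = try Vocabulary(
        fromFile: vocabDirectory.appendingPathComponent("vocab.de"),
        specialTokens: specialTokens)
    let targetVocabulary = try Vocabulary(
        fromFile: vocabDirectory.appendingPathComponent("vocab.en"),
        specialTokens: specialTokens)
    return TextProcessor(
        tokenizer: tokenizer,
        sourceVocabulary: sourceVocabulary,
        targetVocabulary: targetVocabulary,
        maxSequenceLength: 50,
        batchSize: 2000)
}

// Each example then becomes a single-element TranslationBatch whose target side is
// wrapped in <s> ... </s> and padded to `maxSequenceLength`, e.g.:
// let batch = processor.preprocess(
//     example: WMT2014EnDe.Example(id: "0",
//                                  sourceSentence: "Guten Morgen .",
//                                  targetSentence: "Good morning ."))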