From 98ab11844ba806c03ea569b86bf188f7c404d790 Mon Sep 17 00:00:00 2001
From: tabergma <tabergma@gmail.com>
Date: Thu, 1 Nov 2018 10:51:41 +0100
Subject: [PATCH 01/11] GH-177: Fix tests.

---
 tests/conftest.py                    |  2 ++
 tests/test_data.py                   |  3 ---
 tests/test_embeddings.py             | 10 +++++-----
 tests/test_language_model_trainer.py |  2 --
 tests/test_model_integration.py      |  8 ++++----
 tests/test_sequence_tagger.py        |  5 +++--
 tests/test_visual.py                 |  1 -
 7 files changed, 14 insertions(+), 17 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 1937241396..f2f491b0ab 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,10 +1,12 @@
 import pytest
 from pathlib import Path
 
+
 @pytest.fixture(scope="module")
 def resources_path():
     return Path(__file__).parent / 'resources'
 
+
 @pytest.fixture(scope="module")
 def tasks_base_path(resources_path):
     return resources_path / 'tasks'
diff --git a/tests/test_data.py b/tests/test_data.py
index ea23c79440..b2aaf1af03 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -237,9 +237,6 @@ def test_label_set_confidence():
 
     assert (0.2 == label.score)
 
-    # with pytest.raises(ValueError):
-    #     label.name = ''
-
 
 def test_tagged_corpus_make_label_dictionary():
     sentence_1 = Sentence('sentence 1', labels=[Label('class_1')])
diff --git a/tests/test_embeddings.py b/tests/test_embeddings.py
index 2265a09ee1..480c230505 100644
--- a/tests/test_embeddings.py
+++ b/tests/test_embeddings.py
@@ -114,7 +114,7 @@ def test_document_lstm_embeddings():
     sentence, glove, charlm = init_document_embeddings()
 
     embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings([glove, charlm], hidden_states=128,
-                                                                bidirectional=False, use_first_representation=False)
+                                                                bidirectional=False)
 
     embeddings.embed(sentence)
 
@@ -131,7 +131,7 @@ def test_document_bidirectional_lstm_embeddings():
     sentence, glove, charlm = init_document_embeddings()
 
     embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings([glove, charlm], hidden_states=128,
-                                                                bidirectional=True, use_first_representation=False)
+                                                                bidirectional=True)
 
     embeddings.embed(sentence)
 
@@ -148,7 +148,7 @@ def test_document_bidirectional_lstm_embeddings_using_first_representation():
     sentence, glove, charlm = init_document_embeddings()
 
     embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings([glove, charlm], hidden_states=128,
-                                                                bidirectional=True, use_first_representation=True)
+                                                                bidirectional=True)
 
     embeddings.embed(sentence)
 
@@ -165,7 +165,7 @@ def test_document_lstm_embeddings_using_first_representation():
     sentence, glove, charlm = init_document_embeddings()
 
     embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings([glove, charlm], hidden_states=128,
-                                                                bidirectional=False, use_first_representation=True)
+                                                                bidirectional=False)
 
     embeddings.embed(sentence)
 
@@ -221,4 +221,4 @@ def load_and_apply_char_lm_embeddings(emb_type: str):
 
         token.clear_embeddings()
 
-        assert(len(token.get_embedding()) == 0)
\ No newline at end of file
+        assert(len(token.get_embedding()) == 0)
diff --git a/tests/test_language_model_trainer.py b/tests/test_language_model_trainer.py
index 80a4bf60db..d821acc0c6 100644
--- a/tests/test_language_model_trainer.py
+++ b/tests/test_language_model_trainer.py
@@ -32,5 +32,3 @@ def test_training():
 
     # clean up results directory
     shutil.rmtree('./results', ignore_errors=True)
-
-
diff --git a/tests/test_model_integration.py b/tests/test_model_integration.py
index 3ddd9419e7..ddd49ab6f3 100644
--- a/tests/test_model_integration.py
+++ b/tests/test_model_integration.py
@@ -170,7 +170,7 @@ def test_train_load_use_classifier():
             assert (0.0 <= l.score <= 1.0)
             assert (type(l.score) is float)
 
-        loaded_model = TextClassifier.load_from_file('./results/final-model.pt')
+    loaded_model = TextClassifier.load_from_file('./results/final-model.pt')
 
     sentence = Sentence('I love Berlin')
     sentence_empty = Sentence('       ')
@@ -204,7 +204,7 @@ def test_train_charlm_load_use_classifier():
             assert (0.0 <= l.score <= 1.0)
             assert (type(l.score) is float)
 
-        loaded_model = TextClassifier.load_from_file('./results/final-model.pt')
+    loaded_model = TextClassifier.load_from_file('./results/final-model.pt')
 
     sentence = Sentence('I love Berlin')
     sentence_empty = Sentence('       ')
@@ -239,7 +239,7 @@ def test_train_charlm__nocache_load_use_classifier():
             assert (0.0 <= l.score <= 1.0)
             assert (type(l.score) is float)
 
-        loaded_model = TextClassifier.load_from_file('./results/final-model.pt')
+    loaded_model = TextClassifier.load_from_file('./results/final-model.pt')
 
     sentence = Sentence('I love Berlin')
     sentence_empty = Sentence('       ')
@@ -249,4 +249,4 @@ def test_train_charlm__nocache_load_use_classifier():
     loaded_model.predict([sentence_empty])
 
     # clean up results directory
-    shutil.rmtree('./results')
\ No newline at end of file
+    shutil.rmtree('./results')
diff --git a/tests/test_sequence_tagger.py b/tests/test_sequence_tagger.py
index 47ae2fc766..c326c53c64 100644
--- a/tests/test_sequence_tagger.py
+++ b/tests/test_sequence_tagger.py
@@ -5,17 +5,18 @@
 from flair.data import Sentence
 from flair.models import SequenceTagger
 
+
 @pytest.mark.skipif("TRAVIS" in os.environ and os.environ["TRAVIS"] == "true", reason="Skipping this test on Travis CI.")
 def test_tag_sentence():
 
     # test tagging
     sentence = Sentence('I love Berlin')
 
-    tagger = SequenceTagger.load('ner')
+    tagger = SequenceTagger.load('pos')
 
     tagger.predict(sentence)
 
     # test re-tagging
-    tagger = SequenceTagger.load('pos')
+    tagger = SequenceTagger.load('ner')
 
     tagger.predict(sentence)
diff --git a/tests/test_visual.py b/tests/test_visual.py
index fed83a298a..145d8ea5fe 100644
--- a/tests/test_visual.py
+++ b/tests/test_visual.py
@@ -1,5 +1,4 @@
 import os
-import shutil
 
 import pytest
 

From c4973e996a806347a81258b07a181957a20e3cc2 Mon Sep 17 00:00:00 2001
From: tabergma <tabergma@gmail.com>
Date: Thu, 1 Nov 2018 11:07:40 +0100
Subject: [PATCH 02/11] GH-177: Remove duplicate tests.

---
 tests/test_sequence_labeler_trainer.py | 28 -------------
 tests/test_text_classifier_trainer.py  | 56 --------------------------
 2 files changed, 84 deletions(-)
 delete mode 100644 tests/test_sequence_labeler_trainer.py
 delete mode 100644 tests/test_text_classifier_trainer.py

diff --git a/tests/test_sequence_labeler_trainer.py b/tests/test_sequence_labeler_trainer.py
deleted file mode 100644
index 6d1806135a..0000000000
--- a/tests/test_sequence_labeler_trainer.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import shutil
-
-from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
-from flair.embeddings import WordEmbeddings
-from flair.models import SequenceTagger
-from flair.trainers import SequenceTaggerTrainer
-
-
-def test_training(tasks_base_path):
-
-    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION, tasks_base_path)
-    tag_dictionary = corpus.make_tag_dictionary('ner')
-
-    embeddings = WordEmbeddings('glove')
-
-    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
-                                            embeddings=embeddings,
-                                            tag_dictionary=tag_dictionary,
-                                            tag_type='ner',
-                                            use_crf=False)
-
-    # initialize trainer
-    trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(tagger, corpus, test_mode=True)
-
-    trainer.train('./results', learning_rate=0.1, mini_batch_size=2, max_epochs=10)
-
-    # clean up results directory
-    shutil.rmtree('./results')
diff --git a/tests/test_text_classifier_trainer.py b/tests/test_text_classifier_trainer.py
deleted file mode 100644
index 5a4703db0e..0000000000
--- a/tests/test_text_classifier_trainer.py
+++ /dev/null
@@ -1,56 +0,0 @@
-import shutil
-
-from flair.data import Sentence
-
-from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
-from flair.embeddings import WordEmbeddings, DocumentMeanEmbeddings, DocumentLSTMEmbeddings
-from flair.models.text_classification_model import TextClassifier
-from flair.trainers.text_classification_trainer import TextClassifierTrainer
-
-
-def test_text_classifier_single_label(tasks_base_path):
-    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB, tasks_base_path)
-    label_dict = corpus.make_label_dictionary()
-
-    glove_embedding: WordEmbeddings = WordEmbeddings('en-glove')
-    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings([glove_embedding], 128, 1, False, 64, False, False)
-
-    model = TextClassifier(document_embeddings, label_dict, False)
-
-    trainer = TextClassifierTrainer(model, corpus, label_dict, False)
-    trainer.train('./results', max_epochs=2)
-
-    sentence = Sentence("Berlin is a really nice city.")
-
-    for s in model.predict(sentence):
-        for l in s.labels:
-            assert(l.value is not None)
-            assert(0.0 <= l.score <= 1.0)
-            assert(type(l.score) is float)
-
-    # clean up results directory
-    shutil.rmtree('./results')
-
-
-def test_text_classifier_mulit_label(tasks_base_path):
-    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB, tasks_base_path)
-    label_dict = corpus.make_label_dictionary()
-
-    glove_embedding: WordEmbeddings = WordEmbeddings('en-glove')
-    document_embeddings: DocumentMeanEmbeddings = DocumentMeanEmbeddings([glove_embedding])
-
-    model = TextClassifier(document_embeddings, label_dict, True)
-
-    trainer = TextClassifierTrainer(model, corpus, label_dict, False)
-    trainer.train('./results', max_epochs=2)
-
-    sentence = Sentence("Berlin is a really nice city.")
-
-    for s in model.predict(sentence):
-        for l in s.labels:
-            assert(l.value is not None)
-            assert(0.0 <= l.score <= 1.0)
-            assert(type(l.score) is float)
-
-    # clean up results directory
-    shutil.rmtree('./results')

From 2808bc19159206c2441deb17e8e51a2d4b204352 Mon Sep 17 00:00:00 2001
From: tabergma <tabergma@gmail.com>
Date: Thu, 1 Nov 2018 11:08:57 +0100
Subject: [PATCH 03/11] GH-177: Mark slow tests.

---
 tests/conftest.py                    | 16 +++++++++++
 tests/test_embeddings.py             | 41 ++++++++++++++--------------
 tests/test_language_model_trainer.py |  3 ++
 tests/test_model_integration.py      |  9 ++++++
 tests/test_sequence_tagger.py        |  7 ++---
 tests/test_visual.py                 |  6 ++--
 6 files changed, 53 insertions(+), 29 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index f2f491b0ab..9d3db9ac56 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -10,3 +10,19 @@ def resources_path():
 @pytest.fixture(scope="module")
 def tasks_base_path(resources_path):
     return resources_path / 'tasks'
+
+
+def pytest_addoption(parser):
+    parser.addoption(
+        "--runslow", action="store_true", default=False, help="run slow tests"
+    )
+
+
+def pytest_collection_modifyitems(config, items):
+    if config.getoption("--runslow"):
+        # --runslow given in cli: do not skip slow tests
+        return
+    skip_slow = pytest.mark.skip(reason="need --runslow option to run")
+    for item in items:
+        if "slow" in item.keywords:
+            item.add_marker(skip_slow)
diff --git a/tests/test_embeddings.py b/tests/test_embeddings.py
index 480c230505..e598cea217 100644
--- a/tests/test_embeddings.py
+++ b/tests/test_embeddings.py
@@ -1,4 +1,3 @@
-import os
 import pytest
 
 from flair.embeddings import WordEmbeddings, TokenEmbeddings, CharLMEmbeddings, StackedEmbeddings, \
@@ -7,77 +6,77 @@
 from flair.data import Sentence
 
 
-@pytest.mark.skipif("TRAVIS" in os.environ and os.environ["TRAVIS"] == "true", reason="Skipping this test on Travis CI.")
+@pytest.mark.slow
 def test_en_glove():
     load_and_apply_word_embeddings('en-glove')
 
 
-@pytest.mark.skipif("TRAVIS" in os.environ and os.environ["TRAVIS"] == "true", reason="Skipping this test on Travis CI.")
+@pytest.mark.slow
 def test_en_numberbatch():
     load_and_apply_word_embeddings('en-numberbatch')
 
 
-@pytest.mark.skipif("TRAVIS" in os.environ and os.environ["TRAVIS"] == "true", reason="Skipping this test on Travis CI.")
+@pytest.mark.slow
 def test_en_extvec():
     load_and_apply_word_embeddings('en-extvec')
 
 
-@pytest.mark.skipif("TRAVIS" in os.environ and os.environ["TRAVIS"] == "true", reason="Skipping this test on Travis CI.")
+@pytest.mark.slow
 def test_en_crawl():
     load_and_apply_word_embeddings('en-crawl')
 
 
-@pytest.mark.skipif("TRAVIS" in os.environ and os.environ["TRAVIS"] == "true", reason="Skipping this test on Travis CI.")
+@pytest.mark.slow
 def test_en_news():
     load_and_apply_word_embeddings('en-news')
 
 
-@pytest.mark.skipif("TRAVIS" in os.environ and os.environ["TRAVIS"] == "true", reason="Skipping this test on Travis CI.")
+@pytest.mark.slow
 def test_de_fasttext():
     load_and_apply_word_embeddings('de-fasttext')
 
 
-@pytest.mark.skipif("TRAVIS" in os.environ and os.environ["TRAVIS"] == "true", reason="Skipping this test on Travis CI.")
+@pytest.mark.slow
 def test_de_numberbatch():
     load_and_apply_word_embeddings('de-numberbatch')
 
 
-@pytest.mark.skipif("TRAVIS" in os.environ and os.environ["TRAVIS"] == "true", reason="Skipping this test on Travis CI.")
+@pytest.mark.slow
 def test_sv_fasttext():
     load_and_apply_word_embeddings('sv-fasttext')
 
 
-@pytest.mark.skipif("TRAVIS" in os.environ and os.environ["TRAVIS"] == "true", reason="Skipping this test on Travis CI.")
+@pytest.mark.slow
 def test_news_forward():
     load_and_apply_char_lm_embeddings('news-forward')
 
 
-@pytest.mark.skipif("TRAVIS" in os.environ and os.environ["TRAVIS"] == "true", reason="Skipping this test on Travis CI.")
+@pytest.mark.slow
 def test_news_backward():
     load_and_apply_char_lm_embeddings('news-backward')
 
 
-@pytest.mark.skipif("TRAVIS" in os.environ and os.environ["TRAVIS"] == "true", reason="Skipping this test on Travis CI.")
+@pytest.mark.slow
 def test_mix_forward():
     load_and_apply_char_lm_embeddings('mix-forward')
 
 
-@pytest.mark.skipif("TRAVIS" in os.environ and os.environ["TRAVIS"] == "true", reason="Skipping this test on Travis CI.")
+@pytest.mark.slow
 def test_mix_backward():
     load_and_apply_char_lm_embeddings('mix-backward')
 
 
-@pytest.mark.skipif("TRAVIS" in os.environ and os.environ["TRAVIS"] == "true", reason="Skipping this test on Travis CI.")
+@pytest.mark.slow
 def test_german_forward():
     load_and_apply_char_lm_embeddings('german-forward')
 
 
-@pytest.mark.skipif("TRAVIS" in os.environ and os.environ["TRAVIS"] == "true", reason="Skipping this test on Travis CI.")
+@pytest.mark.slow
 def test_german_backward():
     load_and_apply_char_lm_embeddings('german-backward')
 
 
-@pytest.mark.skipif("TRAVIS" in os.environ and os.environ["TRAVIS"] == "true", reason="Skipping this test on Travis CI.")
+@pytest.mark.slow
 def test_stacked_embeddings():
     text = 'I love Berlin.'
     sentence: Sentence = Sentence(text)
@@ -109,7 +108,7 @@ def init_document_embeddings():
     return sentence, glove, charlm
 
 
-@pytest.mark.skipif("TRAVIS" in os.environ and os.environ["TRAVIS"] == "true", reason="Skipping this test on Travis CI.")
+@pytest.mark.slow
 def test_document_lstm_embeddings():
     sentence, glove, charlm = init_document_embeddings()
 
@@ -126,7 +125,7 @@ def test_document_lstm_embeddings():
     assert (len(sentence.get_embedding()) == 0)
 
 
-@pytest.mark.skipif("TRAVIS" in os.environ and os.environ["TRAVIS"] == "true", reason="Skipping this test on Travis CI.")
+@pytest.mark.slow
 def test_document_bidirectional_lstm_embeddings():
     sentence, glove, charlm = init_document_embeddings()
 
@@ -143,7 +142,7 @@ def test_document_bidirectional_lstm_embeddings():
     assert (len(sentence.get_embedding()) == 0)
 
 
-@pytest.mark.skipif("TRAVIS" in os.environ and os.environ["TRAVIS"] == "true", reason="Skipping this test on Travis CI.")
+@pytest.mark.slow
 def test_document_bidirectional_lstm_embeddings_using_first_representation():
     sentence, glove, charlm = init_document_embeddings()
 
@@ -160,7 +159,7 @@ def test_document_bidirectional_lstm_embeddings_using_first_representation():
     assert (len(sentence.get_embedding()) == 0)
 
 
-@pytest.mark.skipif("TRAVIS" in os.environ and os.environ["TRAVIS"] == "true", reason="Skipping this test on Travis CI.")
+@pytest.mark.slow
 def test_document_lstm_embeddings_using_first_representation():
     sentence, glove, charlm = init_document_embeddings()
 
@@ -177,7 +176,7 @@ def test_document_lstm_embeddings_using_first_representation():
     assert (len(sentence.get_embedding()) == 0)
 
 
-@pytest.mark.skipif("TRAVIS" in os.environ and os.environ["TRAVIS"] == "true", reason="Skipping this test on Travis CI.")
+@pytest.mark.slow
 def test_document_mean_embeddings():
     text = 'I love Berlin. Berlin is a great place to live.'
     sentence: Sentence = Sentence(text)
diff --git a/tests/test_language_model_trainer.py b/tests/test_language_model_trainer.py
index d821acc0c6..261377a88a 100644
--- a/tests/test_language_model_trainer.py
+++ b/tests/test_language_model_trainer.py
@@ -1,4 +1,6 @@
 import shutil
+import pytest
+
 from pathlib import Path
 
 from flair.data import Dictionary, Sentence
@@ -7,6 +9,7 @@
 from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus
 
 
+@pytest.mark.slow
 def test_training():
     # get default dictionary
     dictionary: Dictionary = Dictionary.load('chars')
diff --git a/tests/test_model_integration.py b/tests/test_model_integration.py
index ddd49ab6f3..ce069b0e89 100644
--- a/tests/test_model_integration.py
+++ b/tests/test_model_integration.py
@@ -1,4 +1,5 @@
 import os
+import pytest
 import shutil
 
 from flair.data import Sentence
@@ -8,6 +9,7 @@
 from flair.trainers import SequenceTaggerTrainer, TextClassifierTrainer
 
 
+@pytest.mark.slow
 def test_train_load_use_tagger():
 
     corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION)
@@ -39,6 +41,7 @@ def test_train_load_use_tagger():
     shutil.rmtree('./results')
 
 
+@pytest.mark.slow
 def test_train_charlm_load_use_tagger():
 
     corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION)
@@ -70,6 +73,7 @@ def test_train_charlm_load_use_tagger():
     shutil.rmtree('./results')
 
 
+@pytest.mark.slow
 def test_train_charlm_changed_chache_load_use_tagger():
 
     corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION)
@@ -106,6 +110,7 @@ def test_train_charlm_changed_chache_load_use_tagger():
     shutil.rmtree('./results')
 
 
+@pytest.mark.slow
 def test_train_charlm_nochache_load_use_tagger():
 
     corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION)
@@ -137,6 +142,7 @@ def test_train_charlm_nochache_load_use_tagger():
     shutil.rmtree('./results')
 
 
+@pytest.mark.slow
 def test_load_use_serialized_tagger():
 
     loaded_model: SequenceTagger = SequenceTagger.load('ner')
@@ -149,6 +155,7 @@ def test_load_use_serialized_tagger():
     loaded_model.predict([sentence_empty])
 
 
+@pytest.mark.slow
 def test_train_load_use_classifier():
     corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB)
     label_dict = corpus.make_label_dictionary()
@@ -183,6 +190,7 @@ def test_train_load_use_classifier():
     shutil.rmtree('./results')
 
 
+@pytest.mark.slow
 def test_train_charlm_load_use_classifier():
     corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB)
     label_dict = corpus.make_label_dictionary()
@@ -217,6 +225,7 @@ def test_train_charlm_load_use_classifier():
     shutil.rmtree('./results')
 
 
+@pytest.mark.slow
 def test_train_charlm__nocache_load_use_classifier():
     corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB)
     label_dict = corpus.make_label_dictionary()
diff --git a/tests/test_sequence_tagger.py b/tests/test_sequence_tagger.py
index c326c53c64..1792149791 100644
--- a/tests/test_sequence_tagger.py
+++ b/tests/test_sequence_tagger.py
@@ -1,12 +1,7 @@
-import os
-
-import pytest
-
 from flair.data import Sentence
 from flair.models import SequenceTagger
 
 
-@pytest.mark.skipif("TRAVIS" in os.environ and os.environ["TRAVIS"] == "true", reason="Skipping this test on Travis CI.")
 def test_tag_sentence():
 
     # test tagging
@@ -16,6 +11,8 @@ def test_tag_sentence():
 
     tagger.predict(sentence)
 
+    sentence.clear_embeddings()
+
     # test re-tagging
     tagger = SequenceTagger.load('ner')
 
diff --git a/tests/test_visual.py b/tests/test_visual.py
index 145d8ea5fe..5cd91056fb 100644
--- a/tests/test_visual.py
+++ b/tests/test_visual.py
@@ -11,7 +11,7 @@
 from flair.visual.training_curves import Plotter
 
 
-@pytest.mark.skipif("TRAVIS" in os.environ and os.environ["TRAVIS"] == "true", reason="Skipping this test on Travis CI.")
+@pytest.mark.slow
 def test_visualize_word_emeddings(resources_path):
 
     with open('./resources/visual/snippet.txt') as f:
@@ -31,7 +31,7 @@ def test_visualize_word_emeddings(resources_path):
     os.remove('./resources/visual/sentence_embeddings.html')
 
 
-@pytest.mark.skipif("TRAVIS" in os.environ and os.environ["TRAVIS"] == "true", reason="Skipping this test on Travis CI.")
+@pytest.mark.slow
 def test_visualize_word_emeddings():
 
     with open('./resources/visual/snippet.txt') as f:
@@ -48,7 +48,7 @@ def test_visualize_word_emeddings():
     os.remove('./resources/visual/sentence_embeddings.html')
 
 
-@pytest.mark.skipif("TRAVIS" in os.environ and os.environ["TRAVIS"] == "true", reason="Skipping this test on Travis CI.")
+@pytest.mark.slow
 def test_visualize():
 
     with open('./resources/visual/snippet.txt') as f:

From 0ecd02f5dfee5155fac13f47cd831a156e5f1f1c Mon Sep 17 00:00:00 2001
From: tabergma <tabergma@gmail.com>
Date: Thu, 1 Nov 2018 11:18:41 +0100
Subject: [PATCH 04/11] GH-177: Move language model training test.

---
 tests/test_language_model_trainer.py | 37 --------------------------
 tests/test_model_integration.py      | 39 +++++++++++++++++++++++++---
 2 files changed, 35 insertions(+), 41 deletions(-)
 delete mode 100644 tests/test_language_model_trainer.py

diff --git a/tests/test_language_model_trainer.py b/tests/test_language_model_trainer.py
deleted file mode 100644
index 261377a88a..0000000000
--- a/tests/test_language_model_trainer.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import shutil
-import pytest
-
-from pathlib import Path
-
-from flair.data import Dictionary, Sentence
-from flair.embeddings import CharLMEmbeddings
-from flair.models import LanguageModel
-from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus
-
-
-@pytest.mark.slow
-def test_training():
-    # get default dictionary
-    dictionary: Dictionary = Dictionary.load('chars')
-
-    # init forward LM with 128 hidden states and 1 layer
-    language_model: LanguageModel = LanguageModel(dictionary, is_forward_lm=True, hidden_size=128, nlayers=1)
-
-    # get the example corpus and process at character level in forward direction
-    corpus: TextCorpus = TextCorpus(str(Path(__file__).parent / 'resources/corpora/lorem_ipsum'),
-                                    dictionary,
-                                    language_model.is_forward_lm,
-                                    character_level=True)
-
-    # train the language model
-    trainer: LanguageModelTrainer = LanguageModelTrainer(language_model, corpus)
-    trainer.train('./results', sequence_length=10, mini_batch_size=10, max_epochs=5)
-
-    # use the character LM as embeddings to embed the example sentence 'I love Berlin'
-    char_lm_embeddings = CharLMEmbeddings('./results/best-lm.pt')
-    sentence = Sentence('I love Berlin')
-    char_lm_embeddings.embed(sentence)
-    print(sentence[1].embedding.size())
-
-    # clean up results directory
-    shutil.rmtree('./results', ignore_errors=True)
diff --git a/tests/test_model_integration.py b/tests/test_model_integration.py
index ce069b0e89..959a519164 100644
--- a/tests/test_model_integration.py
+++ b/tests/test_model_integration.py
@@ -1,12 +1,15 @@
 import os
-import pytest
 import shutil
+import pytest
 
-from flair.data import Sentence
+from pathlib import Path
+
+from flair.data import Dictionary, Sentence
 from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
 from flair.embeddings import WordEmbeddings, CharLMEmbeddings, DocumentLSTMEmbeddings, TokenEmbeddings
-from flair.models import SequenceTagger, TextClassifier
+from flair.models import SequenceTagger, TextClassifier, LanguageModel
 from flair.trainers import SequenceTaggerTrainer, TextClassifierTrainer
+from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus
 
 
 @pytest.mark.slow
@@ -226,7 +229,7 @@ def test_train_charlm_load_use_classifier():
 
 
 @pytest.mark.slow
-def test_train_charlm__nocache_load_use_classifier():
+def test_train_charlm_nocache_load_use_classifier():
     corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB)
     label_dict = corpus.make_label_dictionary()
 
@@ -259,3 +262,31 @@ def test_train_charlm__nocache_load_use_classifier():
 
     # clean up results directory
     shutil.rmtree('./results')
+
+
+@pytest.mark.slow
+def test_train_language_model():
+    # get default dictionary
+    dictionary: Dictionary = Dictionary.load('chars')
+
+    # init forward LM with 128 hidden states and 1 layer
+    language_model: LanguageModel = LanguageModel(dictionary, is_forward_lm=True, hidden_size=128, nlayers=1)
+
+    # get the example corpus and process at character level in forward direction
+    corpus: TextCorpus = TextCorpus(str(Path(__file__).parent / 'resources/corpora/lorem_ipsum'),
+                                    dictionary,
+                                    language_model.is_forward_lm,
+                                    character_level=True)
+
+    # train the language model
+    trainer: LanguageModelTrainer = LanguageModelTrainer(language_model, corpus)
+    trainer.train('./results', sequence_length=10, mini_batch_size=10, max_epochs=5)
+
+    # use the character LM as embeddings to embed the example sentence 'I love Berlin'
+    char_lm_embeddings = CharLMEmbeddings('./results/best-lm.pt')
+    sentence = Sentence('I love Berlin')
+    char_lm_embeddings.embed(sentence)
+    print(sentence[1].embedding.size())
+
+    # clean up results directory
+    shutil.rmtree('./results', ignore_errors=True)

From fe5aa16b209bad386f03084e070b8b030968ffb5 Mon Sep 17 00:00:00 2001
From: tabergma <tabergma@gmail.com>
Date: Thu, 1 Nov 2018 11:59:36 +0100
Subject: [PATCH 05/11] GH-177: Fix warning and use pathlib.

---
 flair/embeddings.py             |  8 +--
 tests/conftest.py               |  5 ++
 tests/test_embeddings.py        | 21 ++++----
 tests/test_model_integration.py | 86 ++++++++++++++++++---------------
 tests/test_sequence_tagger.py   | 19 --------
 tests/test_text_classifier.py   |  1 -
 tests/test_utils.py             |  1 -
 7 files changed, 65 insertions(+), 76 deletions(-)
 delete mode 100644 tests/test_sequence_tagger.py

diff --git a/flair/embeddings.py b/flair/embeddings.py
index 5316c29752..80b50d625e 100644
--- a/flair/embeddings.py
+++ b/flair/embeddings.py
@@ -207,10 +207,10 @@ def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]:
                     word_embedding = self.precomputed_word_embeddings[token.text]
                 elif token.text.lower() in self.precomputed_word_embeddings:
                     word_embedding = self.precomputed_word_embeddings[token.text.lower()]
-                elif re.sub('\d', '#', token.text.lower()) in self.precomputed_word_embeddings:
-                    word_embedding = self.precomputed_word_embeddings[re.sub('\d', '#', token.text.lower())]
-                elif re.sub('\d', '0', token.text.lower()) in self.precomputed_word_embeddings:
-                    word_embedding = self.precomputed_word_embeddings[re.sub('\d', '0', token.text.lower())]
+                elif re.sub(r'\d', '#', token.text.lower()) in self.precomputed_word_embeddings:
+                    word_embedding = self.precomputed_word_embeddings[re.sub(r'\d', '#', token.text.lower())]
+                elif re.sub(r'\d', '0', token.text.lower()) in self.precomputed_word_embeddings:
+                    word_embedding = self.precomputed_word_embeddings[re.sub(r'\d', '0', token.text.lower())]
                 else:
                     word_embedding = np.zeros(self.embedding_length, dtype='float')
 
diff --git a/tests/conftest.py b/tests/conftest.py
index 9d3db9ac56..48821d4563 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -12,6 +12,11 @@ def tasks_base_path(resources_path):
     return resources_path / 'tasks'
 
 
+@pytest.fixture(scope="module")
+def results_base_path(resources_path):
+    return resources_path / 'results'
+
+
 def pytest_addoption(parser):
     parser.addoption(
         "--runslow", action="store_true", default=False, help="run slow tests"
diff --git a/tests/test_embeddings.py b/tests/test_embeddings.py
index e598cea217..c56b7639e5 100644
--- a/tests/test_embeddings.py
+++ b/tests/test_embeddings.py
@@ -97,17 +97,6 @@ def test_stacked_embeddings():
         assert(len(token.get_embedding()) == 0)
 
 
-@pytest.fixture
-def init_document_embeddings():
-    text = 'I love Berlin. Berlin is a great place to live.'
-    sentence: Sentence = Sentence(text)
-
-    glove: TokenEmbeddings = WordEmbeddings('en-glove')
-    charlm: TokenEmbeddings = CharLMEmbeddings('mix-backward')
-
-    return sentence, glove, charlm
-
-
 @pytest.mark.slow
 def test_document_lstm_embeddings():
     sentence, glove, charlm = init_document_embeddings()
@@ -195,6 +184,16 @@ def test_document_mean_embeddings():
     assert (len(sentence.get_embedding()) == 0)
 
 
+def init_document_embeddings():
+    text = 'I love Berlin. Berlin is a great place to live.'
+    sentence: Sentence = Sentence(text)
+
+    glove: TokenEmbeddings = WordEmbeddings('en-glove')
+    charlm: TokenEmbeddings = CharLMEmbeddings('mix-backward')
+
+    return sentence, glove, charlm
+
+
 def load_and_apply_word_embeddings(emb_type: str):
     text = 'I love Berlin.'
     sentence: Sentence = Sentence(text)
diff --git a/tests/test_model_integration.py b/tests/test_model_integration.py
index 959a519164..1a714a0807 100644
--- a/tests/test_model_integration.py
+++ b/tests/test_model_integration.py
@@ -2,8 +2,6 @@
 import shutil
 import pytest
 
-from pathlib import Path
-
 from flair.data import Dictionary, Sentence
 from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
 from flair.embeddings import WordEmbeddings, CharLMEmbeddings, DocumentLSTMEmbeddings, TokenEmbeddings
@@ -13,7 +11,7 @@
 
 
 @pytest.mark.slow
-def test_train_load_use_tagger():
+def test_train_load_use_tagger(results_base_path):
 
     corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION)
     tag_dictionary = corpus.make_tag_dictionary('ner')
@@ -29,9 +27,9 @@ def test_train_load_use_tagger():
     # initialize trainer
     trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(tagger, corpus, test_mode=True)
 
-    trainer.train('./results', learning_rate=0.1, mini_batch_size=2, max_epochs=3)
+    trainer.train(str(results_base_path), learning_rate=0.1, mini_batch_size=2, max_epochs=3)
 
-    loaded_model: SequenceTagger = SequenceTagger.load_from_file('./results/final-model.pt')
+    loaded_model: SequenceTagger = SequenceTagger.load_from_file(results_base_path / 'final-model.pt')
 
     sentence = Sentence('I love Berlin')
     sentence_empty = Sentence('       ')
@@ -41,11 +39,11 @@ def test_train_load_use_tagger():
     loaded_model.predict([sentence_empty])
 
     # clean up results directory
-    shutil.rmtree('./results')
+    shutil.rmtree(results_base_path)
 
 
 @pytest.mark.slow
-def test_train_charlm_load_use_tagger():
+def test_train_charlm_load_use_tagger(results_base_path):
 
     corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION)
     tag_dictionary = corpus.make_tag_dictionary('ner')
@@ -61,9 +59,9 @@ def test_train_charlm_load_use_tagger():
     # initialize trainer
     trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(tagger, corpus, test_mode=True)
 
-    trainer.train('./results', learning_rate=0.1, mini_batch_size=2, max_epochs=3)
+    trainer.train(str(results_base_path), learning_rate=0.1, mini_batch_size=2, max_epochs=3)
 
-    loaded_model: SequenceTagger = SequenceTagger.load_from_file('./results/final-model.pt')
+    loaded_model: SequenceTagger = SequenceTagger.load_from_file(results_base_path / 'final-model.pt')
 
     sentence = Sentence('I love Berlin')
     sentence_empty = Sentence('       ')
@@ -73,18 +71,19 @@ def test_train_charlm_load_use_tagger():
     loaded_model.predict([sentence_empty])
 
     # clean up results directory
-    shutil.rmtree('./results')
+    shutil.rmtree(results_base_path)
 
 
 @pytest.mark.slow
-def test_train_charlm_changed_chache_load_use_tagger():
+def test_train_charlm_changed_chache_load_use_tagger(results_base_path):
 
     corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION)
     tag_dictionary = corpus.make_tag_dictionary('ner')
 
     # make a temporary cache directory that we remove afterwards
-    os.makedirs('./results/cache/', exist_ok=True)
-    embeddings = CharLMEmbeddings('news-forward-fast', cache_directory='./results/cache/')
+    cache_dir = results_base_path / 'cache'
+    os.makedirs(cache_dir, exist_ok=True)
+    embeddings = CharLMEmbeddings('news-forward-fast', cache_directory=cache_dir)
 
     tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                             embeddings=embeddings,
@@ -95,12 +94,12 @@ def test_train_charlm_changed_chache_load_use_tagger():
     # initialize trainer
     trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(tagger, corpus, test_mode=True)
 
-    trainer.train('./results', learning_rate=0.1, mini_batch_size=2, max_epochs=3)
+    trainer.train(str(results_base_path), learning_rate=0.1, mini_batch_size=2, max_epochs=3)
 
     # remove the cache directory
-    shutil.rmtree('./results/cache')
+    shutil.rmtree(cache_dir)
 
-    loaded_model: SequenceTagger = SequenceTagger.load_from_file('./results/final-model.pt')
+    loaded_model: SequenceTagger = SequenceTagger.load_from_file(results_base_path / 'final-model.pt')
 
     sentence = Sentence('I love Berlin')
     sentence_empty = Sentence('       ')
@@ -110,11 +109,11 @@ def test_train_charlm_changed_chache_load_use_tagger():
     loaded_model.predict([sentence_empty])
 
     # clean up results directory
-    shutil.rmtree('./results')
+    shutil.rmtree(results_base_path)
 
 
 @pytest.mark.slow
-def test_train_charlm_nochache_load_use_tagger():
+def test_train_charlm_nochache_load_use_tagger(results_base_path):
 
     corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION)
     tag_dictionary = corpus.make_tag_dictionary('ner')
@@ -130,9 +129,9 @@ def test_train_charlm_nochache_load_use_tagger():
     # initialize trainer
     trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(tagger, corpus, test_mode=True)
 
-    trainer.train('./results', learning_rate=0.1, mini_batch_size=2, max_epochs=3)
+    trainer.train(str(results_base_path), learning_rate=0.1, mini_batch_size=2, max_epochs=3)
 
-    loaded_model: SequenceTagger = SequenceTagger.load_from_file('./results/final-model.pt')
+    loaded_model: SequenceTagger = SequenceTagger.load_from_file(results_base_path / 'final-model.pt')
 
     sentence = Sentence('I love Berlin')
     sentence_empty = Sentence('       ')
@@ -142,10 +141,9 @@ def test_train_charlm_nochache_load_use_tagger():
     loaded_model.predict([sentence_empty])
 
     # clean up results directory
-    shutil.rmtree('./results')
+    shutil.rmtree(results_base_path)
 
 
-@pytest.mark.slow
 def test_load_use_serialized_tagger():
 
     loaded_model: SequenceTagger = SequenceTagger.load('ner')
@@ -157,9 +155,18 @@ def test_load_use_serialized_tagger():
     loaded_model.predict([sentence, sentence_empty])
     loaded_model.predict([sentence_empty])
 
+    sentence.clear_embeddings()
+    sentence_empty.clear_embeddings()
+
+    loaded_model: SequenceTagger = SequenceTagger.load('pos')
+
+    loaded_model.predict(sentence)
+    loaded_model.predict([sentence, sentence_empty])
+    loaded_model.predict([sentence_empty])
+
 
 @pytest.mark.slow
-def test_train_load_use_classifier():
+def test_train_load_use_classifier(results_base_path):
     corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB)
     label_dict = corpus.make_label_dictionary()
 
@@ -170,7 +177,7 @@ def test_train_load_use_classifier():
     model = TextClassifier(document_embeddings, label_dict, False)
 
     trainer = TextClassifierTrainer(model, corpus, label_dict, False)
-    trainer.train('./results', max_epochs=2)
+    trainer.train(str(results_base_path), max_epochs=2)
 
     sentence = Sentence("Berlin is a really nice city.")
 
@@ -180,7 +187,7 @@ def test_train_load_use_classifier():
             assert (0.0 <= l.score <= 1.0)
             assert (type(l.score) is float)
 
-    loaded_model = TextClassifier.load_from_file('./results/final-model.pt')
+    loaded_model = TextClassifier.load_from_file(results_base_path / 'final-model.pt')
 
     sentence = Sentence('I love Berlin')
     sentence_empty = Sentence('       ')
@@ -190,11 +197,11 @@ def test_train_load_use_classifier():
     loaded_model.predict([sentence_empty])
 
     # clean up results directory
-    shutil.rmtree('./results')
+    shutil.rmtree(results_base_path)
 
 
 @pytest.mark.slow
-def test_train_charlm_load_use_classifier():
+def test_train_charlm_load_use_classifier(results_base_path):
     corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB)
     label_dict = corpus.make_label_dictionary()
 
@@ -205,7 +212,7 @@ def test_train_charlm_load_use_classifier():
     model = TextClassifier(document_embeddings, label_dict, False)
 
     trainer = TextClassifierTrainer(model, corpus, label_dict, False)
-    trainer.train('./results', max_epochs=2)
+    trainer.train(str(results_base_path), max_epochs=2)
 
     sentence = Sentence("Berlin is a really nice city.")
 
@@ -215,7 +222,7 @@ def test_train_charlm_load_use_classifier():
             assert (0.0 <= l.score <= 1.0)
             assert (type(l.score) is float)
 
-    loaded_model = TextClassifier.load_from_file('./results/final-model.pt')
+    loaded_model = TextClassifier.load_from_file(results_base_path / 'final-model.pt')
 
     sentence = Sentence('I love Berlin')
     sentence_empty = Sentence('       ')
@@ -225,11 +232,11 @@ def test_train_charlm_load_use_classifier():
     loaded_model.predict([sentence_empty])
 
     # clean up results directory
-    shutil.rmtree('./results')
+    shutil.rmtree(results_base_path)
 
 
 @pytest.mark.slow
-def test_train_charlm_nocache_load_use_classifier():
+def test_train_charlm_nocache_load_use_classifier(results_base_path):
     corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB)
     label_dict = corpus.make_label_dictionary()
 
@@ -241,7 +248,7 @@ def test_train_charlm_nocache_load_use_classifier():
     model = TextClassifier(document_embeddings, label_dict, False)
 
     trainer = TextClassifierTrainer(model, corpus, label_dict, False)
-    trainer.train('./results', max_epochs=2)
+    trainer.train(str(results_base_path), max_epochs=2)
 
     sentence = Sentence("Berlin is a really nice city.")
 
@@ -251,7 +258,7 @@ def test_train_charlm_nocache_load_use_classifier():
             assert (0.0 <= l.score <= 1.0)
             assert (type(l.score) is float)
 
-    loaded_model = TextClassifier.load_from_file('./results/final-model.pt')
+    loaded_model = TextClassifier.load_from_file(results_base_path / 'final-model.pt')
 
     sentence = Sentence('I love Berlin')
     sentence_empty = Sentence('       ')
@@ -261,11 +268,11 @@ def test_train_charlm_nocache_load_use_classifier():
     loaded_model.predict([sentence_empty])
 
     # clean up results directory
-    shutil.rmtree('./results')
+    shutil.rmtree(results_base_path)
 
 
 @pytest.mark.slow
-def test_train_language_model():
+def test_train_language_model(results_base_path, resources_path):
     # get default dictionary
     dictionary: Dictionary = Dictionary.load('chars')
 
@@ -273,20 +280,19 @@ def test_train_language_model():
     language_model: LanguageModel = LanguageModel(dictionary, is_forward_lm=True, hidden_size=128, nlayers=1)
 
     # get the example corpus and process at character level in forward direction
-    corpus: TextCorpus = TextCorpus(str(Path(__file__).parent / 'resources/corpora/lorem_ipsum'),
+    corpus: TextCorpus = TextCorpus(str(resources_path / 'corpora/lorem_ipsum'),
                                     dictionary,
                                     language_model.is_forward_lm,
                                     character_level=True)
 
     # train the language model
     trainer: LanguageModelTrainer = LanguageModelTrainer(language_model, corpus)
-    trainer.train('./results', sequence_length=10, mini_batch_size=10, max_epochs=5)
+    trainer.train(str(results_base_path), sequence_length=10, mini_batch_size=10, max_epochs=5)
 
     # use the character LM as embeddings to embed the example sentence 'I love Berlin'
-    char_lm_embeddings = CharLMEmbeddings('./results/best-lm.pt')
+    char_lm_embeddings = CharLMEmbeddings(str(results_base_path / 'best-lm.pt'))
     sentence = Sentence('I love Berlin')
     char_lm_embeddings.embed(sentence)
-    print(sentence[1].embedding.size())
 
     # clean up results directory
-    shutil.rmtree('./results', ignore_errors=True)
+    shutil.rmtree(results_base_path, ignore_errors=True)
diff --git a/tests/test_sequence_tagger.py b/tests/test_sequence_tagger.py
deleted file mode 100644
index 1792149791..0000000000
--- a/tests/test_sequence_tagger.py
+++ /dev/null
@@ -1,19 +0,0 @@
-from flair.data import Sentence
-from flair.models import SequenceTagger
-
-
-def test_tag_sentence():
-
-    # test tagging
-    sentence = Sentence('I love Berlin')
-
-    tagger = SequenceTagger.load('pos')
-
-    tagger.predict(sentence)
-
-    sentence.clear_embeddings()
-
-    # test re-tagging
-    tagger = SequenceTagger.load('ner')
-
-    tagger.predict(sentence)
diff --git a/tests/test_text_classifier.py b/tests/test_text_classifier.py
index d8dfa36bc5..1336cb96a1 100644
--- a/tests/test_text_classifier.py
+++ b/tests/test_text_classifier.py
@@ -7,7 +7,6 @@
 from flair.models.text_classification_model import TextClassifier
 
 
-@pytest.fixture
 def init(tasks_base_path) -> Tuple[TaggedCorpus, Dictionary, TextClassifier]:
     corpus = NLPTaskDataFetcher.fetch_data(NLPTask.AG_NEWS, tasks_base_path)
     label_dict = corpus.make_label_dictionary()
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 7dad1ed69f..76d54671a8 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -4,7 +4,6 @@
 from flair.training_utils import calculate_micro_avg_metric, calculate_class_metrics, convert_labels_to_one_hot
 
 
-@pytest.fixture
 def init():
     y_true = [[0, 1, 1], [0, 0, 1], [1, 1, 0]]
     y_pred = [[0, 1, 1], [0, 0, 0], [1, 0, 0]]

From 236066efe2f86d387d13a07ef80a3d0e558c880d Mon Sep 17 00:00:00 2001
From: tabergma <tabergma@gmail.com>
Date: Thu, 1 Nov 2018 12:00:06 +0100
Subject: [PATCH 06/11] GH-177: Update documentation

---
 .travis.yml | 2 --
 README.md   | 8 ++++++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 4063862c02..5923347cea 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,5 +1,3 @@
-env:
-  - TRAVIS=true
 language: python
 sudo: false
 python:
diff --git a/README.md b/README.md
index 691163ef5e..2b75d55018 100644
--- a/README.md
+++ b/README.md
@@ -148,11 +148,15 @@ the code should hopefully be easy.
 
 You need [Pipenv](https://pipenv.readthedocs.io/) for this:
 
-```
+```bash
 pipenv install --dev && pipenv shell
-TRAVIS=true pytest
+pytest
 ```
 
+If you also want to run the slow tests, execute:
+```bash
+pytest --runslow
+```
 
 ## [License](/LICENSE)
 

From 1e2242f8c85ecc5fad0157eacf1cb44289ddeed9 Mon Sep 17 00:00:00 2001
From: tabergma <tabergma@gmail.com>
Date: Thu, 1 Nov 2018 12:58:54 +0100
Subject: [PATCH 07/11] GH-177: Fix tests - Use pathlib

---
 tests/test_model_integration.py | 31 +++++++++++++++++--------------
 tests/test_visual.py            | 22 +++++++++++-----------
 2 files changed, 28 insertions(+), 25 deletions(-)

diff --git a/tests/test_model_integration.py b/tests/test_model_integration.py
index 1a714a0807..88e3ce14cb 100644
--- a/tests/test_model_integration.py
+++ b/tests/test_model_integration.py
@@ -11,9 +11,9 @@
 
 
 @pytest.mark.slow
-def test_train_load_use_tagger(results_base_path):
+def test_train_load_use_tagger(results_base_path, tasks_base_path):
 
-    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION)
+    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION, base_path=tasks_base_path)
     tag_dictionary = corpus.make_tag_dictionary('ner')
 
     embeddings = WordEmbeddings('glove')
@@ -43,9 +43,9 @@ def test_train_load_use_tagger(results_base_path):
 
 
 @pytest.mark.slow
-def test_train_charlm_load_use_tagger(results_base_path):
+def test_train_charlm_load_use_tagger(results_base_path, tasks_base_path):
 
-    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION)
+    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION, base_path=tasks_base_path)
     tag_dictionary = corpus.make_tag_dictionary('ner')
 
     embeddings = CharLMEmbeddings('news-forward-fast')
@@ -75,9 +75,9 @@ def test_train_charlm_load_use_tagger(results_base_path):
 
 
 @pytest.mark.slow
-def test_train_charlm_changed_chache_load_use_tagger(results_base_path):
+def test_train_charlm_changed_chache_load_use_tagger(results_base_path, tasks_base_path):
 
-    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION)
+    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION, base_path=tasks_base_path)
     tag_dictionary = corpus.make_tag_dictionary('ner')
 
     # make a temporary cache directory that we remove afterwards
@@ -113,9 +113,9 @@ def test_train_charlm_changed_chache_load_use_tagger(results_base_path):
 
 
 @pytest.mark.slow
-def test_train_charlm_nochache_load_use_tagger(results_base_path):
+def test_train_charlm_nochache_load_use_tagger(results_base_path, tasks_base_path):
 
-    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION)
+    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION, base_path=tasks_base_path)
     tag_dictionary = corpus.make_tag_dictionary('ner')
 
     embeddings = CharLMEmbeddings('news-forward-fast', use_cache=False)
@@ -166,8 +166,9 @@ def test_load_use_serialized_tagger():
 
 
 @pytest.mark.slow
-def test_train_load_use_classifier(results_base_path):
-    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB)
+def test_train_load_use_classifier(results_base_path, tasks_base_path):
+
+    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB, base_path=tasks_base_path)
     label_dict = corpus.make_label_dictionary()
 
     glove_embedding: WordEmbeddings = WordEmbeddings('en-glove')
@@ -201,8 +202,9 @@ def test_train_load_use_classifier(results_base_path):
 
 
 @pytest.mark.slow
-def test_train_charlm_load_use_classifier(results_base_path):
-    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB)
+def test_train_charlm_load_use_classifier(results_base_path, tasks_base_path):
+
+    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB, base_path=tasks_base_path)
     label_dict = corpus.make_label_dictionary()
 
     glove_embedding: TokenEmbeddings = CharLMEmbeddings('news-forward-fast')
@@ -236,8 +238,9 @@ def test_train_charlm_load_use_classifier(results_base_path):
 
 
 @pytest.mark.slow
-def test_train_charlm_nocache_load_use_classifier(results_base_path):
-    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB)
+def test_train_charlm_nocache_load_use_classifier(results_base_path, tasks_base_path):
+
+    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB, base_path=tasks_base_path)
     label_dict = corpus.make_label_dictionary()
 
     glove_embedding: TokenEmbeddings = CharLMEmbeddings('news-forward-fast', use_cache=False)
diff --git a/tests/test_visual.py b/tests/test_visual.py
index 5cd91056fb..c88d6cd5f8 100644
--- a/tests/test_visual.py
+++ b/tests/test_visual.py
@@ -14,7 +14,7 @@
 @pytest.mark.slow
 def test_visualize_word_emeddings(resources_path):
 
-    with open('./resources/visual/snippet.txt') as f:
+    with open(resources_path / 'visual/snippet.txt') as f:
         sentences = [x for x in f.read().split('\n') if x]
 
     sentences = [Sentence(x) for x in sentences]
@@ -25,16 +25,16 @@ def test_visualize_word_emeddings(resources_path):
     embeddings = StackedEmbeddings([charlm_embedding_backward, charlm_embedding_forward])
 
     visualizer = Visualizer()
-    visualizer.visualize_word_emeddings(embeddings, sentences, './resources/visual/sentence_embeddings.html')
+    visualizer.visualize_word_emeddings(embeddings, sentences, str(resources_path / 'visual/sentence_embeddings.html'))
 
     # clean up directory
-    os.remove('./resources/visual/sentence_embeddings.html')
+    os.remove(resources_path / 'visual/sentence_embeddings.html')
 
 
 @pytest.mark.slow
-def test_visualize_word_emeddings():
+def test_visualize_word_emeddings(resources_path):
 
-    with open('./resources/visual/snippet.txt') as f:
+    with open(resources_path / 'visual/snippet.txt') as f:
         sentences = [x for x in f.read().split('\n') if x]
 
     sentences = [Sentence(x) for x in sentences]
@@ -42,16 +42,16 @@ def test_visualize_word_emeddings():
     charlm_embedding_forward = CharLMEmbeddings('news-forward')
 
     visualizer = Visualizer()
-    visualizer.visualize_char_emeddings(charlm_embedding_forward, sentences, './resources/visual/sentence_embeddings.html')
+    visualizer.visualize_char_emeddings(charlm_embedding_forward, sentences, str(resources_path / 'visual/sentence_embeddings.html'))
 
     # clean up directory
-    os.remove('./resources/visual/sentence_embeddings.html')
+    os.remove(resources_path / 'visual/sentence_embeddings.html')
 
 
 @pytest.mark.slow
-def test_visualize():
+def test_visualize(resources_path):
 
-    with open('./resources/visual/snippet.txt') as f:
+    with open(resources_path / 'visual/snippet.txt') as f:
         sentences = [x for x in f.read().split('\n') if x]
 
     sentences = [Sentence(x) for x in sentences]
@@ -73,10 +73,10 @@ def test_visualize():
     trans_ = tSNE()
     reduced = trans_.fit(X)
 
-    visualizer.visualize(reduced, contexts, './resources/visual/char_embeddings.html')
+    visualizer.visualize(reduced, contexts, str(resources_path / 'visual/char_embeddings.html'))
 
     # clean up directory
-    os.remove('./resources/visual/char_embeddings.html')
+    os.remove(resources_path / 'visual/char_embeddings.html')
 
 
 def test_highlighter(resources_path):

From cd9d28dfc27fdd2d8e2d23f66bc258fc318df96b Mon Sep 17 00:00:00 2001
From: tabergma <tabergma@gmail.com>
Date: Thu, 1 Nov 2018 13:06:08 +0100
Subject: [PATCH 08/11] GH-177: Run slowtest on travis.

---
 .travis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 5923347cea..132f86af05 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,4 +6,4 @@ install:
   - pip install -r requirements.txt -q
 before_script: cd tests
 script:
-  - pytest
\ No newline at end of file
+  - pytest --runslow

From e03fb19e006d7f40f292a6c3f2a265a658a8696f Mon Sep 17 00:00:00 2001
From: tabergma <tabergma@gmail.com>
Date: Thu, 1 Nov 2018 13:17:24 +0100
Subject: [PATCH 09/11] GH-177: Revert running slowtest.

---
 .travis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 132f86af05..7d10d62821 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,4 +6,4 @@ install:
   - pip install -r requirements.txt -q
 before_script: cd tests
 script:
-  - pytest --runslow
+  - pytest

From a576b492f957d9242985b688ea7be565688f5c52 Mon Sep 17 00:00:00 2001
From: tabergma <tabergma@gmail.com>
Date: Tue, 6 Nov 2018 13:33:44 +0100
Subject: [PATCH 10/11] GH-177: Run integration tests on travis.

---
 .travis.yml                     |  2 +-
 README.md                       | 11 ++++++++++-
 tests/conftest.py               | 22 +++++++++++++++------
 tests/test_model_integration.py | 35 +++++++++++++++++----------------
 4 files changed, 45 insertions(+), 25 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 7d10d62821..099372cca2 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,4 +6,4 @@ install:
   - pip install -r requirements.txt -q
 before_script: cd tests
 script:
-  - pytest
+  - pytest --runintegration
diff --git a/README.md b/README.md
index 2b75d55018..3f07be3fc6 100644
--- a/README.md
+++ b/README.md
@@ -153,11 +153,20 @@ pipenv install --dev && pipenv shell
 pytest
 ```
 
-If you also want to run the slow tests, execute:
+To run the integration tests, execute:
+```bash
+pytest --runintegration
+```
+The integration tests train small models.
+Afterwards, each trained model is loaded and used for prediction.
+
+To run the slow tests as well, such as loading and using the embeddings provided by flair, execute the following (`--runslow` and `--runintegration` are independent and can be combined):
 ```bash
 pytest --runslow
 ```
 
+
+
 ## [License](/LICENSE)
 
 The MIT License (MIT)
diff --git a/tests/conftest.py b/tests/conftest.py
index 48821d4563..211931ce52 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -21,13 +21,23 @@ def pytest_addoption(parser):
     parser.addoption(
         "--runslow", action="store_true", default=False, help="run slow tests"
     )
+    parser.addoption(
+        "--runintegration", action="store_true", default=False, help="run integration tests"
+    )
 
 
 def pytest_collection_modifyitems(config, items):
-    if config.getoption("--runslow"):
-        # --runslow given in cli: do not skip slow tests
+    if config.getoption("--runslow") and config.getoption("--runintegration"):
         return
-    skip_slow = pytest.mark.skip(reason="need --runslow option to run")
-    for item in items:
-        if "slow" in item.keywords:
-            item.add_marker(skip_slow)
+
+    if not config.getoption("--runslow"):
+        skip_slow = pytest.mark.skip(reason="need --runslow option to run")
+        for item in items:
+            if "slow" in item.keywords:
+                item.add_marker(skip_slow)
+
+    if not config.getoption("--runintegration"):
+        skip_integration = pytest.mark.skip(reason="need --runintegration option to run")
+        for item in items:
+            if "integration" in item.keywords:
+                item.add_marker(skip_integration)
diff --git a/tests/test_model_integration.py b/tests/test_model_integration.py
index 88e3ce14cb..b3587e6f0b 100644
--- a/tests/test_model_integration.py
+++ b/tests/test_model_integration.py
@@ -10,7 +10,7 @@
 from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus
 
 
-@pytest.mark.slow
+@pytest.mark.integration
 def test_train_load_use_tagger(results_base_path, tasks_base_path):
 
     corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION, base_path=tasks_base_path)
@@ -27,7 +27,7 @@ def test_train_load_use_tagger(results_base_path, tasks_base_path):
     # initialize trainer
     trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(tagger, corpus, test_mode=True)
 
-    trainer.train(str(results_base_path), learning_rate=0.1, mini_batch_size=2, max_epochs=3)
+    trainer.train(str(results_base_path), learning_rate=0.1, mini_batch_size=2, max_epochs=2)
 
     loaded_model: SequenceTagger = SequenceTagger.load_from_file(results_base_path / 'final-model.pt')
 
@@ -42,7 +42,7 @@ def test_train_load_use_tagger(results_base_path, tasks_base_path):
     shutil.rmtree(results_base_path)
 
 
-@pytest.mark.slow
+@pytest.mark.integration
 def test_train_charlm_load_use_tagger(results_base_path, tasks_base_path):
 
     corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION, base_path=tasks_base_path)
@@ -59,7 +59,7 @@ def test_train_charlm_load_use_tagger(results_base_path, tasks_base_path):
     # initialize trainer
     trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(tagger, corpus, test_mode=True)
 
-    trainer.train(str(results_base_path), learning_rate=0.1, mini_batch_size=2, max_epochs=3)
+    trainer.train(str(results_base_path), learning_rate=0.1, mini_batch_size=2, max_epochs=2)
 
     loaded_model: SequenceTagger = SequenceTagger.load_from_file(results_base_path / 'final-model.pt')
 
@@ -74,7 +74,7 @@ def test_train_charlm_load_use_tagger(results_base_path, tasks_base_path):
     shutil.rmtree(results_base_path)
 
 
-@pytest.mark.slow
+@pytest.mark.integration
 def test_train_charlm_changed_chache_load_use_tagger(results_base_path, tasks_base_path):
 
     corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION, base_path=tasks_base_path)
@@ -94,7 +94,7 @@ def test_train_charlm_changed_chache_load_use_tagger(results_base_path, tasks_ba
     # initialize trainer
     trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(tagger, corpus, test_mode=True)
 
-    trainer.train(str(results_base_path), learning_rate=0.1, mini_batch_size=2, max_epochs=3)
+    trainer.train(str(results_base_path), learning_rate=0.1, mini_batch_size=2, max_epochs=2)
 
     # remove the cache directory
     shutil.rmtree(cache_dir)
@@ -112,7 +112,7 @@ def test_train_charlm_changed_chache_load_use_tagger(results_base_path, tasks_ba
     shutil.rmtree(results_base_path)
 
 
-@pytest.mark.slow
+@pytest.mark.integration
 def test_train_charlm_nochache_load_use_tagger(results_base_path, tasks_base_path):
 
     corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION, base_path=tasks_base_path)
@@ -129,7 +129,7 @@ def test_train_charlm_nochache_load_use_tagger(results_base_path, tasks_base_pat
     # initialize trainer
     trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(tagger, corpus, test_mode=True)
 
-    trainer.train(str(results_base_path), learning_rate=0.1, mini_batch_size=2, max_epochs=3)
+    trainer.train(str(results_base_path), learning_rate=0.1, mini_batch_size=2, max_epochs=2)
 
     loaded_model: SequenceTagger = SequenceTagger.load_from_file(results_base_path / 'final-model.pt')
 
@@ -144,6 +144,7 @@ def test_train_charlm_nochache_load_use_tagger(results_base_path, tasks_base_pat
     shutil.rmtree(results_base_path)
 
 
+@pytest.mark.integration
 def test_load_use_serialized_tagger():
 
     loaded_model: SequenceTagger = SequenceTagger.load('ner')
@@ -165,7 +166,7 @@ def test_load_use_serialized_tagger():
     loaded_model.predict([sentence_empty])
 
 
-@pytest.mark.slow
+@pytest.mark.integration
 def test_train_load_use_classifier(results_base_path, tasks_base_path):
 
     corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB, base_path=tasks_base_path)
@@ -177,7 +178,7 @@ def test_train_load_use_classifier(results_base_path, tasks_base_path):
 
     model = TextClassifier(document_embeddings, label_dict, False)
 
-    trainer = TextClassifierTrainer(model, corpus, label_dict, False)
+    trainer = TextClassifierTrainer(model, corpus, label_dict, test_mode=True)
     trainer.train(str(results_base_path), max_epochs=2)
 
     sentence = Sentence("Berlin is a really nice city.")
@@ -201,7 +202,7 @@ def test_train_load_use_classifier(results_base_path, tasks_base_path):
     shutil.rmtree(results_base_path)
 
 
-@pytest.mark.slow
+@pytest.mark.integration
 def test_train_charlm_load_use_classifier(results_base_path, tasks_base_path):
 
     corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB, base_path=tasks_base_path)
@@ -213,7 +214,7 @@ def test_train_charlm_load_use_classifier(results_base_path, tasks_base_path):
 
     model = TextClassifier(document_embeddings, label_dict, False)
 
-    trainer = TextClassifierTrainer(model, corpus, label_dict, False)
+    trainer = TextClassifierTrainer(model, corpus, label_dict, test_mode=True)
     trainer.train(str(results_base_path), max_epochs=2)
 
     sentence = Sentence("Berlin is a really nice city.")
@@ -237,7 +238,7 @@ def test_train_charlm_load_use_classifier(results_base_path, tasks_base_path):
     shutil.rmtree(results_base_path)
 
 
-@pytest.mark.slow
+@pytest.mark.integration
 def test_train_charlm_nocache_load_use_classifier(results_base_path, tasks_base_path):
 
     corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB, base_path=tasks_base_path)
@@ -250,7 +251,7 @@ def test_train_charlm_nocache_load_use_classifier(results_base_path, tasks_base_
 
     model = TextClassifier(document_embeddings, label_dict, False)
 
-    trainer = TextClassifierTrainer(model, corpus, label_dict, False)
+    trainer = TextClassifierTrainer(model, corpus, label_dict, test_mode=True)
     trainer.train(str(results_base_path), max_epochs=2)
 
     sentence = Sentence("Berlin is a really nice city.")
@@ -274,7 +275,7 @@ def test_train_charlm_nocache_load_use_classifier(results_base_path, tasks_base_
     shutil.rmtree(results_base_path)
 
 
-@pytest.mark.slow
+@pytest.mark.integration
 def test_train_language_model(results_base_path, resources_path):
     # get default dictionary
     dictionary: Dictionary = Dictionary.load('chars')
@@ -289,8 +290,8 @@ def test_train_language_model(results_base_path, resources_path):
                                     character_level=True)
 
     # train the language model
-    trainer: LanguageModelTrainer = LanguageModelTrainer(language_model, corpus)
-    trainer.train(str(results_base_path), sequence_length=10, mini_batch_size=10, max_epochs=5)
+    trainer: LanguageModelTrainer = LanguageModelTrainer(language_model, corpus, test_mode=True)
+    trainer.train(str(results_base_path), sequence_length=10, mini_batch_size=10, max_epochs=2)
 
     # use the character LM as embeddings to embed the example sentence 'I love Berlin'
     char_lm_embeddings = CharLMEmbeddings(str(results_base_path / 'best-lm.pt'))

From c8f93d1550959ec8b70a05730a4c7e044de4e9ba Mon Sep 17 00:00:00 2001
From: tabergma <tabergma@gmail.com>
Date: Tue, 6 Nov 2018 13:47:07 +0100
Subject: [PATCH 11/11] GH-177: Run integration test only at PR

---
 .travis.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 099372cca2..573d4a4369 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,4 +6,5 @@ install:
   - pip install -r requirements.txt -q
 before_script: cd tests
 script:
-  - pytest --runintegration
+  - 'if [ "$TRAVIS_PULL_REQUEST" != "false" ]; then pytest --runintegration; fi'
+  - 'if [ "$TRAVIS_PULL_REQUEST" = "false" ]; then pytest; fi'
\ No newline at end of file