From b8dd49bcaec6b3812b6c2ff9c7d03f559aac44b7 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Sat, 29 Jun 2024 20:24:47 +0200 Subject: [PATCH 01/67] Moving LightFM to extras Signed-off-by: miguelgfierro --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 87923cf00..b57ad28f2 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,6 @@ "category-encoders>=2.6.0,<3", # requires packaging "cornac>=1.15.2,<2", # requires packaging, tqdm "hyperopt>=0.2.7,<1", - "lightfm>=1.17,<2", # requires requests "lightgbm>=4.0.0,<5", "locust>=2.12.2,<3", # requires jinja2 "memory-profiler>=0.61.0,<1", @@ -80,6 +79,7 @@ # nni needs to be upgraded "nni==1.5", "pymanopt>=0.2.5", + "lightfm>=1.17,<2", ] # The following dependency can be installed as below, however PyPI does not allow direct URLs. From c2e9572f68ec69975fb076112dc3d59497c7baf3 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Sat, 29 Jun 2024 20:36:35 +0200 Subject: [PATCH 02/67] move lightfm tests to experimental Signed-off-by: miguelgfierro --- tests/ci/azureml_tests/test_groups.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tests/ci/azureml_tests/test_groups.py b/tests/ci/azureml_tests/test_groups.py index f05e27a9f..2a262c12d 100644 --- a/tests/ci/azureml_tests/test_groups.py +++ b/tests/ci/azureml_tests/test_groups.py @@ -47,8 +47,6 @@ "tests/functional/examples/test_notebooks_python.py::test_geoimc_functional", # 1006.19s # "tests/functional/examples/test_notebooks_python.py::test_benchmark_movielens_cpu", # 58s - # - "tests/functional/examples/test_notebooks_python.py::test_lightfm_functional", ], "group_cpu_003": [ # Total group time: 2253s "tests/data_validation/recommenders/datasets/test_criteo.py::test_download_criteo_sample", # 1.05s @@ -237,10 +235,6 @@ "tests/unit/recommenders/models/test_geoimc.py::test_imcproblem", "tests/unit/recommenders/models/test_geoimc.py::test_inferer_init", "tests/unit/recommenders/models/test_geoimc.py::test_inferer_infer", - "tests/unit/recommenders/models/test_lightfm_utils.py::test_interactions", - "tests/unit/recommenders/models/test_lightfm_utils.py::test_fitting", - "tests/unit/recommenders/models/test_lightfm_utils.py::test_sim_users", - "tests/unit/recommenders/models/test_lightfm_utils.py::test_sim_items", "tests/unit/recommenders/models/test_sar_singlenode.py::test_init", "tests/unit/recommenders/models/test_sar_singlenode.py::test_fit", "tests/unit/recommenders/models/test_sar_singlenode.py::test_predict", @@ -453,3 +447,14 @@ "tests/unit/examples/test_notebooks_gpu.py::test_gpu_vm", ], } + +# Experimental test groups are additional groups that require installing extra dependencies: pip install .[experimental] +experimental_test_groups = { + "group_cpu_001": [ + "tests/unit/recommenders/models/test_lightfm_utils.py::test_interactions", + "tests/unit/recommenders/models/test_lightfm_utils.py::test_fitting", + "tests/unit/recommenders/models/test_lightfm_utils.py::test_sim_users", + "tests/unit/recommenders/models/test_lightfm_utils.py::test_sim_items", + "tests/functional/examples/test_notebooks_python.py::test_lightfm_functional", + ] +} From fe1379027046eb47863fe1f298abb09d32261be7 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Sat, 29 Jun 2024 20:41:25 +0200 Subject: [PATCH 03/67] Note in notebook Signed-off-by: miguelgfierro --- .../02_model_collaborative_filtering/lightfm_deep_dive.ipynb | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/examples/02_model_collaborative_filtering/lightfm_deep_dive.ipynb b/examples/02_model_collaborative_filtering/lightfm_deep_dive.ipynb index 8e588760f..5a60091d7 100755 --- a/examples/02_model_collaborative_filtering/lightfm_deep_dive.ipynb +++ b/examples/02_model_collaborative_filtering/lightfm_deep_dive.ipynb @@ -22,6 +22,8 @@ "source": [ "This notebook explains the concept of a Factorization Machine based model for recommendation, it also outlines the steps to construct a pure matrix factorization and a Factorization Machine using the [LightFM](https://github.com/lyst/lightfm) package. It also demonstrates how to extract both user and item affinity from a fitted model.\n", "\n", + "*NOTE: LightFM is not available in the core package of Recommenders. To run this notebook, install the experimental package with `pip install recommenders[experimental]`.*\n", + "\n", "## 1. Factorization Machine model\n", "\n", "### 1.1 Background\n", From cf64eed0a2e3eb93ab14e8f20a72d2d04f36c7bb Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Sat, 29 Jun 2024 22:21:02 +0200 Subject: [PATCH 04/67] Deprecation of SchemaModel in Pandera Signed-off-by: miguelgfierro --- recommenders/datasets/movielens.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recommenders/datasets/movielens.py b/recommenders/datasets/movielens.py index c0b3b5f72..a8a8b4441 100644 --- a/recommenders/datasets/movielens.py +++ b/recommenders/datasets/movielens.py @@ -582,7 +582,7 @@ def unique_columns(df, *, columns): return not df[columns].duplicated().any() -class MockMovielensSchema(pa.SchemaModel): +class MockMovielensSchema(pa.DataFrameModel): """ Mock dataset schema to generate fake data for testing purpose. This schema is configured to mimic the Movielens dataset From 15317261aab06aca08f7e9af6409b03d5d4a132a Mon Sep 17 00:00:00 2001 From: sumana sree Date: Wed, 3 Jul 2024 13:54:02 +0530 Subject: [PATCH 05/67] Updated Issue_Templates with the sub heading of willingness to contribute Signed-off-by: sumana sree --- .github/ISSUE_TEMPLATE.md | 5 +++++ .github/ISSUE_TEMPLATE/bug_report.md | 5 +++++ .github/ISSUE_TEMPLATE/feature_request.md | 5 +++++ .github/ISSUE_TEMPLATE/general-ask.md | 5 +++++ 4 files changed, 20 insertions(+) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 7f7f33816..ba0644977 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -22,5 +22,10 @@ +### Willingness to contribute +- [ ] Yes, I can contribute for this issue independently. +- [ ] Yes, I can contribute for this issue with guidance from Recommenders community. +- [ ] No, I cannot contribute at this time. + ### Other Comments diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 48fde3bbe..0f4833d20 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -28,4 +28,9 @@ assignees: '' +### Willingness to contribute +- [ ] Yes, I can contribute for this issue independently. +- [ ] Yes, I can contribute for this issue with guidance from Recommenders community. +- [ ] No, I cannot contribute at this time. + ### Other Comments diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index 40080a5a2..b6588a699 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -14,4 +14,9 @@ assignees: '' +### Willingness to contribute +- [ ] Yes, I can contribute for this issue independently. 
+- [ ] Yes, I can contribute for this issue with guidance from Recommenders community. +- [ ] No, I cannot contribute at this time. + ### Other Comments diff --git a/.github/ISSUE_TEMPLATE/general-ask.md b/.github/ISSUE_TEMPLATE/general-ask.md index 76442059b..138f79ba3 100644 --- a/.github/ISSUE_TEMPLATE/general-ask.md +++ b/.github/ISSUE_TEMPLATE/general-ask.md @@ -10,4 +10,9 @@ assignees: '' ### Description +### Willingness to contribute +- [ ] Yes, I can contribute for this issue independently. +- [ ] Yes, I can contribute for this issue with guidance from Recommenders community. +- [ ] No, I cannot contribute at this time. + ### Other Comments From f93aace968560ffb43d7669486e00bcf22e0dd11 Mon Sep 17 00:00:00 2001 From: sumana sree Date: Wed, 3 Jul 2024 14:05:59 +0530 Subject: [PATCH 06/67] Added comment under willing to contribute Signed-off-by: sumana sree --- .github/ISSUE_TEMPLATE.md | 2 ++ .github/ISSUE_TEMPLATE/bug_report.md | 1 + .github/ISSUE_TEMPLATE/feature_request.md | 1 + .github/ISSUE_TEMPLATE/general-ask.md | 1 + 4 files changed, 5 insertions(+) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index ba0644977..9216fade1 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -22,7 +22,9 @@ + ### Willingness to contribute + - [ ] Yes, I can contribute for this issue independently. - [ ] Yes, I can contribute for this issue with guidance from Recommenders community. - [ ] No, I cannot contribute at this time. diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 0f4833d20..eb49cad11 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -29,6 +29,7 @@ assignees: '' ### Willingness to contribute + - [ ] Yes, I can contribute for this issue independently. - [ ] Yes, I can contribute for this issue with guidance from Recommenders community. - [ ] No, I cannot contribute at this time. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index b6588a699..e192f6320 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -15,6 +15,7 @@ assignees: '' ### Willingness to contribute + - [ ] Yes, I can contribute for this issue independently. - [ ] Yes, I can contribute for this issue with guidance from Recommenders community. - [ ] No, I cannot contribute at this time. diff --git a/.github/ISSUE_TEMPLATE/general-ask.md b/.github/ISSUE_TEMPLATE/general-ask.md index 138f79ba3..f8ea3d49f 100644 --- a/.github/ISSUE_TEMPLATE/general-ask.md +++ b/.github/ISSUE_TEMPLATE/general-ask.md @@ -11,6 +11,7 @@ assignees: '' ### Willingness to contribute + - [ ] Yes, I can contribute for this issue independently. - [ ] Yes, I can contribute for this issue with guidance from Recommenders community. - [ ] No, I cannot contribute at this time. 
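PATCH 01/67 to PATCH 03/67 above move LightFM out of the core dependencies and behind the `experimental` extra, which is why the notebook note points users at `pip install recommenders[experimental]`. For readers unfamiliar with the mechanism, below is a minimal sketch of the setuptools `extras_require` pattern those patches rely on; the package name and version pins are illustrative placeholders, not the project's actual `setup.py`.

```python
# Minimal sketch of the setuptools extras pattern (hypothetical package and pins).
from setuptools import setup

setup(
    name="example-recsys",  # hypothetical package name
    install_requires=[
        "numpy>=1.21",  # core dependency, always installed
    ],
    extras_require={
        # Installed only on request, e.g. `pip install example-recsys[experimental]`
        "experimental": ["lightfm>=1.17,<2"],
    },
)
```

Keeping a hard-to-build dependency behind an extra keeps the default `pip install recommenders` lightweight, while `pip install recommenders[experimental]` (or `pip install .[experimental]` from a checkout, as the new `test_groups.py` comment notes) pulls in LightFM for the deep-dive notebook and the experimental test group.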
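Similarly, PATCH 04/67 above is a one-line migration from Pandera's deprecated `pa.SchemaModel` base class to its successor, `pa.DataFrameModel`. A minimal sketch of the class-based schema style, assuming Pandera >= 0.14 and using made-up column names rather than the project's real `MockMovielensSchema`:

```python
import pandas as pd
import pandera as pa
from pandera.typing import Series


class RatingSchema(pa.DataFrameModel):
    """Hypothetical rating schema, for illustration only."""

    userID: Series[int] = pa.Field(ge=1)
    rating: Series[float] = pa.Field(ge=1.0, le=5.0)


df = pd.DataFrame({"userID": [1, 2], "rating": [4.0, 3.5]})
RatingSchema.validate(df)  # raises pandera.errors.SchemaError on a mismatch
```

The rename is mechanical because `DataFrameModel` keeps the same class-based API, so subclasses such as `MockMovielensSchema` should keep working unchanged.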
From 49836f313bc4638e4a0af76a3e53024e8c64a00e Mon Sep 17 00:00:00 2001 From: sumana sree Date: Wed, 3 Jul 2024 14:09:19 +0530 Subject: [PATCH 07/67] updated comment under willingness to contribute Signed-off-by: sumana sree --- .github/ISSUE_TEMPLATE.md | 2 +- .github/ISSUE_TEMPLATE/bug_report.md | 2 +- .github/ISSUE_TEMPLATE/feature_request.md | 2 +- .github/ISSUE_TEMPLATE/general-ask.md | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 9216fade1..0ee0d676b 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -24,7 +24,7 @@ ### Willingness to contribute - + - [ ] Yes, I can contribute for this issue independently. - [ ] Yes, I can contribute for this issue with guidance from Recommenders community. - [ ] No, I cannot contribute at this time. diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index eb49cad11..733808969 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -29,7 +29,7 @@ assignees: '' ### Willingness to contribute - + - [ ] Yes, I can contribute for this issue independently. - [ ] Yes, I can contribute for this issue with guidance from Recommenders community. - [ ] No, I cannot contribute at this time. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index e192f6320..055d4a3fc 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -15,7 +15,7 @@ assignees: '' ### Willingness to contribute - + - [ ] Yes, I can contribute for this issue independently. - [ ] Yes, I can contribute for this issue with guidance from Recommenders community. - [ ] No, I cannot contribute at this time. diff --git a/.github/ISSUE_TEMPLATE/general-ask.md b/.github/ISSUE_TEMPLATE/general-ask.md index f8ea3d49f..b89ac7595 100644 --- a/.github/ISSUE_TEMPLATE/general-ask.md +++ b/.github/ISSUE_TEMPLATE/general-ask.md @@ -11,7 +11,7 @@ assignees: '' ### Willingness to contribute - + - [ ] Yes, I can contribute for this issue independently. - [ ] Yes, I can contribute for this issue with guidance from Recommenders community. - [ ] No, I cannot contribute at this time. From 891fc8534969f68a45fd1708e3434fe41aa5c676 Mon Sep 17 00:00:00 2001 From: sumana sree Date: Thu, 4 Jul 2024 23:12:40 +0530 Subject: [PATCH 08/67] modified 2 files to update newsrec model Signed-off-by: sumana sree --- recommenders/models/newsrec/models/base_model.py | 4 ++++ .../smoke/recommenders/recommender/test_newsrec_model.py | 8 ++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/recommenders/models/newsrec/models/base_model.py b/recommenders/models/newsrec/models/base_model.py index e0f4da017..1e2f2cdd4 100644 --- a/recommenders/models/newsrec/models/base_model.py +++ b/recommenders/models/newsrec/models/base_model.py @@ -186,6 +186,8 @@ def fit( valid_behaviors_file, test_news_file=None, test_behaviors_file=None, + steps=None, + ): """Fit the model with train_file. Evaluate the model on valid_file per epoch to observe the training status. If test_news_file is not None, evaluate it too. 
@@ -212,6 +214,8 @@ def fit( ) for batch_data_input in tqdm_util: + if steps is not None and steps>=steps: + break step_result = self.train(batch_data_input) step_data_loss = step_result diff --git a/tests/smoke/recommenders/recommender/test_newsrec_model.py b/tests/smoke/recommenders/recommender/test_newsrec_model.py index 7cad05ba3..8abd2d433 100644 --- a/tests/smoke/recommenders/recommender/test_newsrec_model.py +++ b/tests/smoke/recommenders/recommender/test_newsrec_model.py @@ -62,7 +62,7 @@ def test_model_nrms(mind_resource_path): assert model.run_eval(valid_news_file, valid_behaviors_file) is not None assert isinstance( model.fit( - train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file + train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file,steps=1 ), BaseModel, ) @@ -115,7 +115,7 @@ def test_model_naml(mind_resource_path): assert model.run_eval(valid_news_file, valid_behaviors_file) is not None assert isinstance( model.fit( - train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file + train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file,steps=1 ), BaseModel, ) @@ -166,7 +166,7 @@ def test_model_lstur(mind_resource_path): assert model.run_eval(valid_news_file, valid_behaviors_file) is not None assert isinstance( model.fit( - train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file + train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file,steps=1 ), BaseModel, ) @@ -217,7 +217,7 @@ def test_model_npa(mind_resource_path): assert model.run_eval(valid_news_file, valid_behaviors_file) is not None assert isinstance( model.fit( - train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file + train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file,steps=1 ), BaseModel, ) From 7c9a6931031d78c5f956d4bcb13f3156dbaa8989 Mon Sep 17 00:00:00 2001 From: sumana sree Date: Fri, 5 Jul 2024 19:22:43 +0530 Subject: [PATCH 09/67] changed the parameter name from steps to step_limit Signed-off-by: sumana sree --- recommenders/models/newsrec/models/base_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/recommenders/models/newsrec/models/base_model.py b/recommenders/models/newsrec/models/base_model.py index 1e2f2cdd4..c19679c95 100644 --- a/recommenders/models/newsrec/models/base_model.py +++ b/recommenders/models/newsrec/models/base_model.py @@ -186,7 +186,7 @@ def fit( valid_behaviors_file, test_news_file=None, test_behaviors_file=None, - steps=None, + step_limit=None, ): """Fit the model with train_file. Evaluate the model on valid_file per epoch to observe the training status. 
@@ -214,7 +214,7 @@ def fit( ) for batch_data_input in tqdm_util: - if steps is not None and steps>=steps: + if step_limit is not None and step_limit>=step_limit: break step_result = self.train(batch_data_input) From 2a9604e49663773b2c2b8239edf7f627614e5100 Mon Sep 17 00:00:00 2001 From: sumana sree Date: Fri, 5 Jul 2024 19:26:09 +0530 Subject: [PATCH 10/67] changed parameter name to step_limit from steps Signed-off-by: sumana sree --- .../smoke/recommenders/recommender/test_newsrec_model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/smoke/recommenders/recommender/test_newsrec_model.py b/tests/smoke/recommenders/recommender/test_newsrec_model.py index 8abd2d433..f52f20cdd 100644 --- a/tests/smoke/recommenders/recommender/test_newsrec_model.py +++ b/tests/smoke/recommenders/recommender/test_newsrec_model.py @@ -62,7 +62,7 @@ def test_model_nrms(mind_resource_path): assert model.run_eval(valid_news_file, valid_behaviors_file) is not None assert isinstance( model.fit( - train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file,steps=1 + train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file,step_limit=1 ), BaseModel, ) @@ -115,7 +115,7 @@ def test_model_naml(mind_resource_path): assert model.run_eval(valid_news_file, valid_behaviors_file) is not None assert isinstance( model.fit( - train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file,steps=1 + train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file,step_limit=1 ), BaseModel, ) @@ -166,7 +166,7 @@ def test_model_lstur(mind_resource_path): assert model.run_eval(valid_news_file, valid_behaviors_file) is not None assert isinstance( model.fit( - train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file,steps=1 + train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file,step_limit=1 ), BaseModel, ) @@ -217,7 +217,7 @@ def test_model_npa(mind_resource_path): assert model.run_eval(valid_news_file, valid_behaviors_file) is not None assert isinstance( model.fit( - train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file,steps=1 + train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file,step_limit=1 ), BaseModel, ) From 650ad365748b3be297c4a9a14d6c21c2b1c63034 Mon Sep 17 00:00:00 2001 From: Sumana Sree <110307215+sumana-2705@users.noreply.github.com> Date: Sat, 6 Jul 2024 09:07:57 +0530 Subject: [PATCH 11/67] Update recommenders/models/newsrec/models/base_model.py Co-authored-by: Miguel Fierro <3491412+miguelgfierro@users.noreply.github.com> --- recommenders/models/newsrec/models/base_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recommenders/models/newsrec/models/base_model.py b/recommenders/models/newsrec/models/base_model.py index c19679c95..344ea8f7c 100644 --- a/recommenders/models/newsrec/models/base_model.py +++ b/recommenders/models/newsrec/models/base_model.py @@ -214,7 +214,7 @@ def fit( ) for batch_data_input in tqdm_util: - if step_limit is not None and step_limit>=step_limit: + if step_limit is not None and step>=step_limit: break step_result = self.train(batch_data_input) From 6cb04d3a7a82cb4ddc31945abb2bc9e9b83e411d Mon Sep 17 00:00:00 2001 From: sumana sree Date: Sat, 6 Jul 2024 09:15:07 +0530 Subject: [PATCH 12/67] corrected mistake Signed-off-by: sumana sree --- recommenders/models/newsrec/models/base_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/recommenders/models/newsrec/models/base_model.py b/recommenders/models/newsrec/models/base_model.py index c19679c95..344ea8f7c 100644 --- a/recommenders/models/newsrec/models/base_model.py +++ b/recommenders/models/newsrec/models/base_model.py @@ -214,7 +214,7 @@ def fit( ) for batch_data_input in tqdm_util: - if step_limit is not None and step_limit>=step_limit: + if step_limit is not None and step>=step_limit: break step_result = self.train(batch_data_input) From 3fbbefe62e8895e2f829d91baf5eedaedab8de5b Mon Sep 17 00:00:00 2001 From: sumana sree Date: Sun, 7 Jul 2024 14:12:57 +0530 Subject: [PATCH 13/67] set step_limit = 10 Signed-off-by: sumana sree --- .../smoke/recommenders/recommender/test_newsrec_model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/smoke/recommenders/recommender/test_newsrec_model.py b/tests/smoke/recommenders/recommender/test_newsrec_model.py index f52f20cdd..db609f098 100644 --- a/tests/smoke/recommenders/recommender/test_newsrec_model.py +++ b/tests/smoke/recommenders/recommender/test_newsrec_model.py @@ -62,7 +62,7 @@ def test_model_nrms(mind_resource_path): assert model.run_eval(valid_news_file, valid_behaviors_file) is not None assert isinstance( model.fit( - train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file,step_limit=1 + train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file,step_limit=10 ), BaseModel, ) @@ -115,7 +115,7 @@ def test_model_naml(mind_resource_path): assert model.run_eval(valid_news_file, valid_behaviors_file) is not None assert isinstance( model.fit( - train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file,step_limit=1 + train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file,step_limit=10 ), BaseModel, ) @@ -166,7 +166,7 @@ def test_model_lstur(mind_resource_path): assert model.run_eval(valid_news_file, valid_behaviors_file) is not None assert isinstance( model.fit( - train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file,step_limit=1 + train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file,step_limit=10 ), BaseModel, ) @@ -217,7 +217,7 @@ def test_model_npa(mind_resource_path): assert model.run_eval(valid_news_file, valid_behaviors_file) is not None assert isinstance( model.fit( - train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file,step_limit=1 + train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file,step_limit=10 ), BaseModel, ) From 3672c2ebac8277ac3867645dfc158a08cdfb0b22 Mon Sep 17 00:00:00 2001 From: Simon Zhao Date: Tue, 9 Jul 2024 16:08:55 +0800 Subject: [PATCH 14/67] Set scipy <= 1.13.1 (#2127) * Set scipy<=1.13.1 Signed-off-by: Simon Zhao * Leave a comment as a reminder Signed-off-by: Simon Zhao --------- Signed-off-by: Simon Zhao --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index b57ad28f2..631d6cd83 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ install_requires = [ "category-encoders>=2.6.0,<3", # requires packaging - "cornac>=1.15.2,<2", # requires packaging, tqdm + "cornac>=1.15.2,<3", # requires packaging, tqdm "hyperopt>=0.2.7,<1", "lightgbm>=4.0.0,<5", "locust>=2.12.2,<3", # requires jinja2 @@ -42,7 +42,7 @@ "retrying>=1.3.4,<2", "scikit-learn>=1.2.0,<2", # requires scipy, and introduce breaking change affects feature_extraction.text.TfidfVectorizer.min_df "scikit-surprise>=1.1.3", - "scipy>=1.10.1", + "scipy>=1.10.1,<=1.13.1", # FIXME: Remove 
scipy<=1.13.1 once cornac releases a version newer than 2.2.1. See #2128 "seaborn>=0.13.0,<1", # requires matplotlib, packaging "transformers>=4.27.0,<5", # requires packaging, pyyaml, requests, tqdm ] From 4654367941380da7a77c1edf8c3bf7be407a3b69 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Sat, 13 Jul 2024 09:33:19 +0200 Subject: [PATCH 15/67] advice tests Signed-off-by: miguelgfierro --- tests/README.md | 11 +++++------ tests/unit/recommenders/utils/test_gpu_utils.py | 2 +- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/tests/README.md b/tests/README.md index a6068daec..949e78df8 100644 --- a/tests/README.md +++ b/tests/README.md @@ -74,15 +74,10 @@ In this section we show how to create tests and add them to the test pipeline. T ### How to create tests for the Recommenders library -You want to make sure that all your code works before you submit it to the repository. Here are some guidelines for creating the unit tests: +You want to make sure that all your code works before you submit it to the repository. Here are some guidelines for creating the tests: * It is better to create multiple small tests than one large test that checks all the code. * Use `@pytest.fixture` to create data in your tests. -* Use the mark `@pytest.mark.gpu` if you want the test to be executed - in a GPU environment. Use `@pytest.mark.spark` if you want the test - to be executed in a Spark environment. -* Use `@pytest.mark.notebooks` if you are testing a notebook. -* Avoid using `is` in the asserts, instead use the operator `==`. * Follow the pattern `assert computation == value`, for example: ```python assert results["precision"] == pytest.approx(0.330753) ``` @@ -92,6 +87,10 @@ assert results["precision"] == pytest.approx(0.330753) assert rmse(rating_true, rating_true) == 0 assert rmse(rating_true, rating_pred) == pytest.approx(7.254309) ``` +* Use the operator `==` with values. Use the operator `is` in singletons like `None`, `True` or `False`. +* Use the mark `@pytest.mark.gpu` if you want the test to be executed in a GPU environment. Use `@pytest.mark.spark` if you want the test to be executed in a Spark environment. +* Use `@pytest.mark.notebooks` if you are testing a notebook. + ### How to create tests for the notebooks diff --git a/tests/unit/recommenders/utils/test_gpu_utils.py b/tests/unit/recommenders/utils/test_gpu_utils.py index 7cbe9b287..be4c8b89c 100644 --- a/tests/unit/recommenders/utils/test_gpu_utils.py +++ b/tests/unit/recommenders/utils/test_gpu_utils.py @@ -47,7 +47,7 @@ def test_get_cudnn_version(): @pytest.mark.gpu def test_cudnn_enabled(): - assert torch.backends.cudnn.enabled == True + assert torch.backends.cudnn.enabled is True @pytest.mark.gpu From 9d449f29c8c4b47cb0161d37a69b995496fd8142 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Sat, 13 Jul 2024 09:41:28 +0200 Subject: [PATCH 16/67] test intro Signed-off-by: miguelgfierro --- tests/README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/README.md b/tests/README.md index 949e78df8..af8f466be 100644 --- a/tests/README.md +++ b/tests/README.md @@ -5,6 +5,12 @@ Licensed under the MIT License. # Tests +The Recommenders test pipeline is one of the most sophisticated MLOps pipelines in the open-source community. We execute tests in the three environments we support: CPU, GPU, and Spark, mirroring the tests in each Python version we support. We not only test the library, but also the Jupyter notebooks in the examples folder. 
+ +The reason to have this extensive test infrastructure is to ensure that the code is reproducible by the community and that we can maintain the project with a small number of core contributors. + +We currently execute over a thousand tests in the project, and we are always looking for ways to improve the test coverage. To get the exact number of tests, you can run `pytest tests --collect-only`, and then multiply the number of tests by the number of Python versions we support. + In this document we show our test infrastructure and how to contribute tests to the repository. ## Table of Contents From d5b8d7e3e95736cdc2ffb902920e2495678cd46c Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Sat, 13 Jul 2024 09:44:15 +0200 Subject: [PATCH 17/67] Review data validation Signed-off-by: miguelgfierro --- tests/data_validation/examples/test_wikidata.py | 2 -- tests/data_validation/recommenders/datasets/test_movielens.py | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/data_validation/examples/test_wikidata.py b/tests/data_validation/examples/test_wikidata.py index 65c676f67..7c8a1d502 100644 --- a/tests/data_validation/examples/test_wikidata.py +++ b/tests/data_validation/examples/test_wikidata.py @@ -8,7 +8,6 @@ @pytest.mark.notebooks -# @pytest.mark.skip(reason="Wikidata API is unstable") def test_wikidata_runs(notebooks, output_notebook, kernel_name, tmp): notebook_path = notebooks["wikidata_knowledge_graph"] MOVIELENS_SAMPLE_SIZE = 5 @@ -25,7 +24,6 @@ def test_wikidata_runs(notebooks, output_notebook, kernel_name, tmp): @pytest.mark.notebooks -# @pytest.mark.skip(reason="Wikidata API is unstable") def test_wikidata_values(notebooks, output_notebook, kernel_name): notebook_path = notebooks["wikidata_knowledge_graph"] execute_notebook( diff --git a/tests/data_validation/recommenders/datasets/test_movielens.py b/tests/data_validation/recommenders/datasets/test_movielens.py index 5af7e9673..947e7629b 100644 --- a/tests/data_validation/recommenders/datasets/test_movielens.py +++ b/tests/data_validation/recommenders/datasets/test_movielens.py @@ -126,8 +126,8 @@ def test_download_and_extract_movielens(size, tmp): ) # Test if raw-zip file, rating file, and item file are cached assert len(os.listdir(tmp)) == 3 - assert os.path.exists(rating_path) - assert os.path.exists(item_path) + assert os.path.exists(rating_path) is True + assert os.path.exists(item_path) is True @pytest.mark.parametrize( From 17e8c8c8e44fbb0e6d585516be219652c3069f8b Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Sat, 13 Jul 2024 09:53:09 +0200 Subject: [PATCH 18/67] smoke refact Signed-off-by: miguelgfierro --- tests/README.md | 1 + tests/ci/azureml_tests/test_groups.py | 26 +++++++++---------- .../datasets/test_criteo_privacy.py | 2 +- .../datasets/test_movielens_privacy.py | 3 ++- .../{recommender => models}/__init__.py | 0 .../test_deeprec_model.py | 0 .../test_deeprec_utils.py | 0 .../test_newsrec_model.py | 0 .../test_newsrec_utils.py | 0 9 files changed, 17 insertions(+), 15 deletions(-) rename tests/smoke/recommenders/{recommender => models}/__init__.py (100%) rename tests/smoke/recommenders/{recommender => models}/test_deeprec_model.py (100%) rename tests/smoke/recommenders/{recommender => models}/test_deeprec_utils.py (100%) rename tests/smoke/recommenders/{recommender => models}/test_newsrec_model.py (100%) rename tests/smoke/recommenders/{recommender => models}/test_newsrec_utils.py (100%) diff --git a/tests/README.md b/tests/README.md index af8f466be..df8e3e96d 100644 --- 
a/tests/README.md +++ b/tests/README.md @@ -94,6 +94,7 @@ assert rmse(rating_true, rating_true) == 0 assert rmse(rating_true, rating_pred) == pytest.approx(7.254309) ``` * Use the operator `==` with values. Use the operator `is` in singletons like `None`, `True` or `False`. +* Make explicit asserts. In other words, make sure you assert to something (`assert computation == value`) and not just `assert computation`. * Use the mark `@pytest.mark.gpu` if you want the test to be executed in a GPU environment. Use `@pytest.mark.spark` if you want the test to be executed in a Spark environment. * Use `@pytest.mark.notebooks` if you are testing a notebook. diff --git a/tests/ci/azureml_tests/test_groups.py b/tests/ci/azureml_tests/test_groups.py index 2a262c12d..aa2d78b6e 100644 --- a/tests/ci/azureml_tests/test_groups.py +++ b/tests/ci/azureml_tests/test_groups.py @@ -63,24 +63,24 @@ ], "group_gpu_001": [ # Total group time: 1937.01s "tests/unit/examples/test_notebooks_gpu.py::test_gpu_vm", # 0.76s (Always the first test to check the GPU works) - "tests/smoke/recommenders/recommender/test_deeprec_utils.py", # 2.91 - "tests/smoke/recommenders/recommender/test_deeprec_model.py::test_FFM_iterator", # 0.74s - "tests/smoke/recommenders/recommender/test_newsrec_utils.py::test_news_iterator", # 3.04s + "tests/smoke/recommenders/models/test_deeprec_utils.py", # 2.91 + "tests/smoke/recommenders/models/test_deeprec_model.py::test_FFM_iterator", # 0.74s + "tests/smoke/recommenders/models/test_newsrec_utils.py::test_news_iterator", # 3.04s # - "tests/smoke/recommenders/recommender/test_deeprec_model.py::test_model_lightgcn", # 6.03s + "tests/smoke/recommenders/models/test_deeprec_model.py::test_model_lightgcn", # 6.03s "tests/functional/examples/test_notebooks_gpu.py::test_lightgcn_deep_dive_functional", # 19.45s # - # "tests/smoke/recommenders/recommender/test_deeprec_model.py::test_model_sum", # 27.23s # FIXME: Disabled due to the issue with TF version > 2.10.1 See #2018 + # "tests/smoke/recommenders/models/test_deeprec_model.py::test_model_sum", # 27.23s # FIXME: Disabled due to the issue with TF version > 2.10.1 See #2018 # - "tests/smoke/recommenders/recommender/test_deeprec_model.py::test_model_dkn", # 187.20s + "tests/smoke/recommenders/models/test_deeprec_model.py::test_model_dkn", # 187.20s "tests/functional/examples/test_notebooks_gpu.py::test_dkn_quickstart_functional", # 1167.93s # "tests/functional/examples/test_notebooks_gpu.py::test_slirec_quickstart_functional", # 175.00s - "tests/smoke/recommenders/recommender/test_deeprec_model.py::test_model_slirec", # 346.72s + "tests/smoke/recommenders/models/test_deeprec_model.py::test_model_slirec", # 346.72s ], "group_gpu_002": [ # Total group time: 1896.76s "tests/unit/examples/test_notebooks_gpu.py::test_gpu_vm", # 0.76s (Always the first test to check the GPU works) - "tests/smoke/recommenders/recommender/test_deeprec_model.py::test_model_xdeepfm", # 3.10s + "tests/smoke/recommenders/models/test_deeprec_model.py::test_model_xdeepfm", # 3.10s # FIXME: https://github.com/microsoft/recommenders/issues/1883 # "tests/smoke/examples/test_notebooks_gpu.py::test_xdeepfm_smoke", # 77.93s "tests/functional/examples/test_notebooks_gpu.py::test_xdeepfm_functional", @@ -100,9 +100,9 @@ "tests/smoke/examples/test_notebooks_gpu.py::test_ncf_deep_dive_smoke", # 102.71s "tests/functional/examples/test_notebooks_gpu.py::test_ncf_deep_dive_functional", # 351.17s # - "tests/smoke/recommenders/recommender/test_newsrec_utils.py::test_naml_iterator", # 5.50s + 
"tests/smoke/recommenders/models/test_newsrec_utils.py::test_naml_iterator", # 5.50s # FIXME: https://github.com/microsoft/recommenders/issues/1883 - # "tests/smoke/recommenders/recommender/test_newsrec_model.py::test_model_naml", # 450.65s + # "tests/smoke/recommenders/models/test_newsrec_model.py::test_model_naml", # 450.65s ], "group_gpu_004": [ # Total group time: 2103.34s "tests/unit/examples/test_notebooks_gpu.py::test_gpu_vm", # 0.76s (Always the first test to check the GPU works) @@ -125,8 +125,8 @@ ], "group_gpu_006": [ # Total group time: 1763.99s "tests/unit/examples/test_notebooks_gpu.py::test_gpu_vm", # 0.76s (Always the first test to check the GPU works) - "tests/smoke/recommenders/recommender/test_newsrec_model.py::test_model_npa", # 202.61s - "tests/smoke/recommenders/recommender/test_newsrec_model.py::test_model_nrms", # 188.60s + "tests/smoke/recommenders/models/test_newsrec_model.py::test_model_npa", # 202.61s + "tests/smoke/recommenders/models/test_newsrec_model.py::test_model_nrms", # 188.60s ], "group_gpu_007": [ # Total group time: 846.89s "tests/unit/examples/test_notebooks_gpu.py::test_gpu_vm", # 0.76s (Always the first test to check the GPU works) @@ -138,7 +138,7 @@ # "tests/functional/examples/test_notebooks_gpu.py::test_naml_quickstart_functional", # 2033.85s # FIXME: https://github.com/microsoft/recommenders/issues/1716 # "tests/functional/examples/test_notebooks_gpu.py::test_sasrec_quickstart_functional", # 448.06s + 614.69s - "tests/smoke/recommenders/recommender/test_newsrec_model.py::test_model_lstur", # 194.88s + "tests/smoke/recommenders/models/test_newsrec_model.py::test_model_lstur", # 194.88s ], "group_spark_001": [ # Total group time: 987.16s "tests/data_validation/recommenders/datasets/test_movielens.py::test_load_spark_df", # 4.33s+ 25.58s + 101.99s + 139.23s diff --git a/tests/responsible_ai/recommenders/datasets/test_criteo_privacy.py b/tests/responsible_ai/recommenders/datasets/test_criteo_privacy.py index 61dfc2295..d932b59ce 100644 --- a/tests/responsible_ai/recommenders/datasets/test_criteo_privacy.py +++ b/tests/responsible_ai/recommenders/datasets/test_criteo_privacy.py @@ -12,4 +12,4 @@ def test_criteo_privacy(criteo_first_row): data is anonymized. 
""" df = criteo.load_pandas_df(size="sample") - assert df.loc[0].equals(pd.Series(criteo_first_row)) + assert df.loc[0].equals(pd.Series(criteo_first_row)) is True diff --git a/tests/responsible_ai/recommenders/datasets/test_movielens_privacy.py b/tests/responsible_ai/recommenders/datasets/test_movielens_privacy.py index dd6a16ccc..ba75711fd 100644 --- a/tests/responsible_ai/recommenders/datasets/test_movielens_privacy.py +++ b/tests/responsible_ai/recommenders/datasets/test_movielens_privacy.py @@ -11,4 +11,5 @@ def test_movielens_privacy(): """ df = movielens.load_pandas_df(size="100k") users = df["userID"].values.tolist() - assert all(isinstance(x, int) for x in users) + + assert all(isinstance(x, int) for x in users) is True diff --git a/tests/smoke/recommenders/recommender/__init__.py b/tests/smoke/recommenders/models/__init__.py similarity index 100% rename from tests/smoke/recommenders/recommender/__init__.py rename to tests/smoke/recommenders/models/__init__.py diff --git a/tests/smoke/recommenders/recommender/test_deeprec_model.py b/tests/smoke/recommenders/models/test_deeprec_model.py similarity index 100% rename from tests/smoke/recommenders/recommender/test_deeprec_model.py rename to tests/smoke/recommenders/models/test_deeprec_model.py diff --git a/tests/smoke/recommenders/recommender/test_deeprec_utils.py b/tests/smoke/recommenders/models/test_deeprec_utils.py similarity index 100% rename from tests/smoke/recommenders/recommender/test_deeprec_utils.py rename to tests/smoke/recommenders/models/test_deeprec_utils.py diff --git a/tests/smoke/recommenders/recommender/test_newsrec_model.py b/tests/smoke/recommenders/models/test_newsrec_model.py similarity index 100% rename from tests/smoke/recommenders/recommender/test_newsrec_model.py rename to tests/smoke/recommenders/models/test_newsrec_model.py diff --git a/tests/smoke/recommenders/recommender/test_newsrec_utils.py b/tests/smoke/recommenders/models/test_newsrec_utils.py similarity index 100% rename from tests/smoke/recommenders/recommender/test_newsrec_utils.py rename to tests/smoke/recommenders/models/test_newsrec_utils.py From 287405bc77aaf7e8a710db717e2011e7015e7fd3 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Sat, 13 Jul 2024 10:02:58 +0200 Subject: [PATCH 19/67] unit Signed-off-by: miguelgfierro --- .../datasets/test_download_utils.py | 12 ++++---- .../datasets/test_pandas_df_utils.py | 22 +++++++------- .../datasets/test_spark_splitter.py | 10 +++---- .../evaluation/test_python_evaluation.py | 30 +++++++++---------- .../recommenders/models/test_deeprec_utils.py | 5 ++-- .../models/test_ncf_singlenode.py | 12 ++++---- .../models/test_sar_singlenode.py | 8 ++--- .../recommenders/models/test_vowpal_wabbit.py | 4 +-- 8 files changed, 52 insertions(+), 51 deletions(-) diff --git a/tests/unit/recommenders/datasets/test_download_utils.py b/tests/unit/recommenders/datasets/test_download_utils.py index c7d649796..305dc1ecb 100644 --- a/tests/unit/recommenders/datasets/test_download_utils.py +++ b/tests/unit/recommenders/datasets/test_download_utils.py @@ -25,7 +25,7 @@ def test_maybe_download(files_fixtures): os.remove(filepath) downloaded_filepath = maybe_download(file_url, "license.txt", expected_bytes=1212) - assert os.path.exists(downloaded_filepath) + assert os.path.exists(downloaded_filepath) is True assert os.path.basename(downloaded_filepath) == "license.txt" @@ -51,7 +51,7 @@ def test_maybe_download_maybe(caplog, files_fixtures): os.remove(filepath) downloaded_filepath = maybe_download(file_url, "license.txt") - 
assert os.path.exists(downloaded_filepath) + assert os.path.exists(downloaded_filepath) is True maybe_download(file_url, "license.txt") assert "File ." + os.path.sep + "license.txt already downloaded" in caplog.text @@ -69,11 +69,11 @@ def test_maybe_download_retry(caplog): def test_download_path(): # Check that the temporal path is created and deleted with download_path() as path: - assert os.path.isdir(path) - assert not os.path.isdir(path) + assert os.path.isdir(path) is True + assert os.path.isdir(path) is False # Check the behavior when a path is provided tmp_dir = TemporaryDirectory() with download_path(tmp_dir.name) as path: - assert os.path.isdir(path) - assert os.path.isdir(path) + assert os.path.isdir(path) is True + assert os.path.isdir(path) is False diff --git a/tests/unit/recommenders/datasets/test_pandas_df_utils.py b/tests/unit/recommenders/datasets/test_pandas_df_utils.py index 7fe502d18..f6eb98727 100644 --- a/tests/unit/recommenders/datasets/test_pandas_df_utils.py +++ b/tests/unit/recommenders/datasets/test_pandas_df_utils.py @@ -235,10 +235,10 @@ def test_has_columns(): df_1 = pd.DataFrame(dict(a=[1, 2, 3])) df_2 = pd.DataFrame(dict(b=[7, 8, 9], a=[1, 2, 3])) - assert has_columns(df_1, ["a"]) - assert has_columns(df_2, ["a"]) - assert has_columns(df_2, ["a", "b"]) - assert not has_columns(df_2, ["a", "b", "c"]) + assert has_columns(df_1, ["a"]) is True + assert has_columns(df_2, ["a"]) is True + assert has_columns(df_2, ["a", "b"]) is True + assert has_columns(df_2, ["a", "b", "c"]) is False def test_has_same_base_dtype(): @@ -256,19 +256,19 @@ def test_has_same_base_dtype(): df_6 = pd.DataFrame(dict(a=arr_str)) # all columns match - assert has_same_base_dtype(df_1, df_2) + assert has_same_base_dtype(df_1, df_2) is True # specific column matches - assert has_same_base_dtype(df_3, df_4, columns=["a"]) + assert has_same_base_dtype(df_3, df_4, columns=["a"]) is True # some column types do not match - assert not has_same_base_dtype(df_3, df_4) + assert has_same_base_dtype(df_3, df_4) is False # column types do not match - assert not has_same_base_dtype(df_1, df_3, columns=["a"]) + assert has_same_base_dtype(df_1, df_3, columns=["a"]) is False # all columns are not shared - assert not has_same_base_dtype(df_4, df_5) + assert has_same_base_dtype(df_4, df_5) is False # column types do not match - assert not has_same_base_dtype(df_5, df_6, columns=["a"]) + assert has_same_base_dtype(df_5, df_6, columns=["a"]) is False # assert string columns match - assert has_same_base_dtype(df_6, df_6) + assert has_same_base_dtype(df_6, df_6) is True def test_lru_cache_df(): diff --git a/tests/unit/recommenders/datasets/test_spark_splitter.py b/tests/unit/recommenders/datasets/test_spark_splitter.py index 9f6d40254..62e5c2c77 100644 --- a/tests/unit/recommenders/datasets/test_spark_splitter.py +++ b/tests/unit/recommenders/datasets/test_spark_splitter.py @@ -80,8 +80,8 @@ def test_min_rating_filter(spark_dataset): x["count"] >= 5 for x in dfs_item.groupBy(DEFAULT_ITEM_COL).count().collect() ] - assert all(user_rating_counts) - assert all(item_rating_counts) + assert all(user_rating_counts) is True + assert all(item_rating_counts) is True @pytest.mark.spark @@ -123,7 +123,7 @@ def test_chrono_splitter(spark_dataset): assert set(users_train) == set(users_test) - assert _if_later(splits[0], splits[1]) + assert _if_later(splits[0], splits[1]) is True splits = spark_chrono_split(spark_dataset, ratio=RATIOS) @@ -131,8 +131,8 @@ def test_chrono_splitter(spark_dataset): assert splits[1].count() / 
NUM_ROWS == pytest.approx(RATIOS[1], TOL) assert splits[2].count() / NUM_ROWS == pytest.approx(RATIOS[2], TOL) - assert _if_later(splits[0], splits[1]) - assert _if_later(splits[1], splits[2]) + assert _if_later(splits[0], splits[1]) is True + assert _if_later(splits[1], splits[2]) is True @pytest.mark.spark diff --git a/tests/unit/recommenders/evaluation/test_python_evaluation.py b/tests/unit/recommenders/evaluation/test_python_evaluation.py index e2f6dc149..fc26caf59 100644 --- a/tests/unit/recommenders/evaluation/test_python_evaluation.py +++ b/tests/unit/recommenders/evaluation/test_python_evaluation.py @@ -90,7 +90,7 @@ def test_column_dtypes_match(rating_true, rating_pred): col_rating=DEFAULT_RATING_COL, col_prediction=DEFAULT_PREDICTION_COL, ) - + # Drop a column, and there should column mismatch error produced rating_true.drop(DEFAULT_USER_COL, axis="columns", inplace=True) with pytest.raises(ColumnMismatchError): @@ -117,8 +117,8 @@ def test_merge_rating(rating_true, rating_pred): target_y_pred = np.array([14, 12, 7, 8, 13, 6, 11, 5]) assert y_true.shape == y_pred.shape - assert np.all(y_true == target_y_true) - assert np.all(y_pred == target_y_pred) + assert np.all(y_true == target_y_true) is True + assert np.all(y_pred == target_y_pred) is True def test_merge_ranking(rating_true, rating_pred): @@ -375,10 +375,16 @@ def test_python_r_precision(rating_true, rating_pred, rating_nohit): k=10, ) == pytest.approx(1, TOL) assert r_precision_at_k(rating_true, rating_nohit, k=5) == 0.0 - assert r_precision_at_k(rating_true, rating_pred, k=3) == pytest.approx(0.21111, TOL) - assert r_precision_at_k(rating_true, rating_pred, k=5) == pytest.approx(0.24444, TOL) + assert r_precision_at_k(rating_true, rating_pred, k=3) == pytest.approx( + 0.21111, TOL + ) + assert r_precision_at_k(rating_true, rating_pred, k=5) == pytest.approx( + 0.24444, TOL + ) # Equivalent to precision - assert r_precision_at_k(rating_true, rating_pred, k=10) == pytest.approx(0.37777, TOL) + assert r_precision_at_k(rating_true, rating_pred, k=10) == pytest.approx( + 0.37777, TOL + ) def test_python_auc(rating_true_binary, rating_pred_binary): @@ -522,9 +528,7 @@ def test_user_diversity(diversity_data): col_relevance=None, ) assert_frame_equal( - pd.DataFrame( - dict(UserId=[1, 2, 3], user_diversity=[0.29289, 1.0, 0.0]) - ), + pd.DataFrame(dict(UserId=[1, 2, 3], user_diversity=[0.29289, 1.0, 0.0])), actual, check_exact=False, atol=TOL, @@ -625,9 +629,7 @@ def test_user_diversity_item_feature_vector(diversity_data): col_relevance=None, ) assert_frame_equal( - pd.DataFrame( - dict(UserId=[1, 2, 3], user_diversity=[0.5000, 0.5000, 0.5000]) - ), + pd.DataFrame(dict(UserId=[1, 2, 3], user_diversity=[0.5000, 0.5000, 0.5000])), actual, check_exact=False, ) @@ -695,9 +697,7 @@ def test_user_serendipity_item_feature_vector(diversity_data): col_relevance="Relevance", ) assert_frame_equal( - pd.DataFrame( - dict(UserId=[1, 2, 3], user_serendipity=[0.2500, 0.625, 0.3333]) - ), + pd.DataFrame(dict(UserId=[1, 2, 3], user_serendipity=[0.2500, 0.625, 0.3333])), actual, check_exact=False, atol=TOL, diff --git a/tests/unit/recommenders/models/test_deeprec_utils.py b/tests/unit/recommenders/models/test_deeprec_utils.py index 310e4ef3a..1a02852c6 100644 --- a/tests/unit/recommenders/models/test_deeprec_utils.py +++ b/tests/unit/recommenders/models/test_deeprec_utils.py @@ -29,7 +29,8 @@ def test_prepare_hparams(deeprec_resource_path, must_exist_attributes): "xdeepfmresources.zip", ) hparams = prepare_hparams(yaml_file) - assert 
hasattr(hparams, must_exist_attributes) + + assert hasattr(hparams, must_exist_attributes) is True @pytest.mark.gpu @@ -43,6 +44,6 @@ def test_load_yaml_file(deeprec_resource_path): data_path, "xdeepfmresources.zip", ) - config = load_yaml(yaml_file) + assert config is not None diff --git a/tests/unit/recommenders/models/test_ncf_singlenode.py b/tests/unit/recommenders/models/test_ncf_singlenode.py index 918bd368d..9a7cb170e 100644 --- a/tests/unit/recommenders/models/test_ncf_singlenode.py +++ b/tests/unit/recommenders/models/test_ncf_singlenode.py @@ -92,8 +92,8 @@ def test_regular_save_load(model_type, n_users, n_items): Q_ = model.sess.run(model.embedding_mlp_Q) # test load function - assert np.array_equal(P, P_) - assert np.array_equal(Q, Q_) + assert np.array_equal(P, P_) is True + assert np.array_equal(Q, Q_) is True if os.path.exists(ckpt): shutil.rmtree(ckpt) @@ -132,10 +132,10 @@ def test_neumf_save_load(n_users, n_items): P_mlp_ = model.sess.run(model.embedding_mlp_P) Q_mlp_ = model.sess.run(model.embedding_mlp_Q) - assert np.array_equal(P_gmf, P_gmf_) - assert np.array_equal(Q_gmf, Q_gmf_) - assert np.array_equal(P_mlp, P_mlp_) - assert np.array_equal(Q_mlp, Q_mlp_) + assert np.array_equal(P_gmf, P_gmf_) is True + assert np.array_equal(Q_gmf, Q_gmf_) is True + assert np.array_equal(P_mlp, P_mlp_) is True + assert np.array_equal(Q_mlp, Q_mlp_) is True if os.path.exists(ckpt_gmf): shutil.rmtree(ckpt_gmf) diff --git a/tests/unit/recommenders/models/test_sar_singlenode.py b/tests/unit/recommenders/models/test_sar_singlenode.py index 19e79b233..cc2611477 100644 --- a/tests/unit/recommenders/models/test_sar_singlenode.py +++ b/tests/unit/recommenders/models/test_sar_singlenode.py @@ -23,7 +23,7 @@ def test_init(header): assert model.col_prediction == "prediction" assert model.similarity_type == "jaccard" assert model.time_decay_half_life == 2592000 - assert not model.time_decay_flag + assert model.time_decay_flag is False assert model.time_now is None assert model.threshold == 1 @@ -53,7 +53,7 @@ def test_predict( preds = model.predict(testset) assert len(preds) == 2 - assert isinstance(preds, pd.DataFrame) + assert isinstance(preds, pd.DataFrame) is True assert preds[header["col_user"]].dtype == trainset[header["col_user"]].dtype assert preds[header["col_item"]].dtype == trainset[header["col_item"]].dtype assert preds[DEFAULT_PREDICTION_COL].dtype == trainset[header["col_rating"]].dtype @@ -375,8 +375,8 @@ def test_get_normalized_scores(header): ) assert actual.shape == (2, 7) - assert isinstance(actual, np.ndarray) - assert np.isclose(expected, np.asarray(actual)).all() + assert isinstance(actual, np.ndarray) is True + assert np.isclose(expected, np.asarray(actual)).all() is True def test_match_similarity_type_from_json_file(header): diff --git a/tests/unit/recommenders/models/test_vowpal_wabbit.py b/tests/unit/recommenders/models/test_vowpal_wabbit.py index 0db5bdd70..10a0b212b 100644 --- a/tests/unit/recommenders/models/test_vowpal_wabbit.py +++ b/tests/unit/recommenders/models/test_vowpal_wabbit.py @@ -28,10 +28,10 @@ def model(): def test_vw_init_del(): model = VW() tempdir = model.tempdir.name - assert os.path.exists(tempdir) + assert os.path.exists(tempdir) is True del model - assert not os.path.exists(tempdir) + assert os.path.exists(tempdir) is False @pytest.mark.experimental From fac0873d6a38ba91e913162f08a0defcea912abe Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Sat, 13 Jul 2024 10:04:28 +0200 Subject: [PATCH 20/67] unit Signed-off-by: miguelgfierro --- 
tests/unit/recommenders/tuning/test_ncf_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/recommenders/tuning/test_ncf_utils.py b/tests/unit/recommenders/tuning/test_ncf_utils.py index 3f2039bc5..55a3ff086 100644 --- a/tests/unit/recommenders/tuning/test_ncf_utils.py +++ b/tests/unit/recommenders/tuning/test_ncf_utils.py @@ -40,4 +40,4 @@ def test_compute_test_results__return_success(mock_model, fake_movielens_df): [mock_metric_func], [mock_metric_func], ) - assert mock_model.predict.is_called + assert mock_model.predict.is_called is True From bd070a77d7f4ca2a3c3846cfe62f29f4908a9c27 Mon Sep 17 00:00:00 2001 From: Simon Zhao Date: Tue, 16 Jul 2024 16:21:13 +0800 Subject: [PATCH 21/67] Try to disable snapshot Signed-off-by: Simon Zhao --- tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py b/tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py index adda7e172..6618eed42 100644 --- a/tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py +++ b/tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py @@ -243,6 +243,7 @@ def create_run_config( ) run_azuremlcompute.environment.python.conda_dependencies = conda_dep + run_azuremlcompute.history.snapshot_project = False return run_azuremlcompute From 304e1b35e759d7c7f4b745317ad0c49f72b70db8 Mon Sep 17 00:00:00 2001 From: Simon Zhao Date: Tue, 16 Jul 2024 17:00:26 +0800 Subject: [PATCH 22/67] Change path to run_groupwise_pytest.py Signed-off-by: Simon Zhao --- tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py b/tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py index 6618eed42..c053f1ee3 100644 --- a/tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py +++ b/tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py @@ -325,7 +325,7 @@ def create_arg_parser(): parser.add_argument( "--test", action="store", - default="./tests/ci/azureml_tests/run_groupwise_pytest.py", + default="tests/ci/azureml_tests/run_groupwise_pytest.py", help="location of script to run pytest", ) # max num nodes in Azure cluster From a4ce3cadcf5a12b2f3d9ceb907f6ed89c13ceded Mon Sep 17 00:00:00 2001 From: Simon Zhao Date: Tue, 16 Jul 2024 17:54:02 +0800 Subject: [PATCH 23/67] Enable snapshot Signed-off-by: Simon Zhao --- tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py b/tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py index c053f1ee3..761fe8950 100644 --- a/tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py +++ b/tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py @@ -243,7 +243,6 @@ def create_run_config( ) run_azuremlcompute.environment.python.conda_dependencies = conda_dep - run_azuremlcompute.history.snapshot_project = False return run_azuremlcompute From 0967dad53c37f9bc4bf60ee8cc8bc6718d550ca6 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Wed, 17 Jul 2024 12:20:21 +0200 Subject: [PATCH 24/67] remove comparison with predefined booleans like np.bool Signed-off-by: miguelgfierro --- .../recommenders/datasets/test_movielens.py | 4 ++-- .../recommenders/datasets/test_download_utils.py | 12 ++++++------ .../evaluation/test_python_evaluation.py | 4 ++-- tests/unit/recommenders/models/test_vowpal_wabbit.py | 4 ++-- 4 files changed, 12 
insertions(+), 12 deletions(-) diff --git a/tests/data_validation/recommenders/datasets/test_movielens.py b/tests/data_validation/recommenders/datasets/test_movielens.py index 947e7629b..5af7e9673 100644 --- a/tests/data_validation/recommenders/datasets/test_movielens.py +++ b/tests/data_validation/recommenders/datasets/test_movielens.py @@ -126,8 +126,8 @@ def test_download_and_extract_movielens(size, tmp): ) # Test if raw-zip file, rating file, and item file are cached assert len(os.listdir(tmp)) == 3 - assert os.path.exists(rating_path) is True - assert os.path.exists(item_path) is True + assert os.path.exists(rating_path) + assert os.path.exists(item_path) @pytest.mark.parametrize( diff --git a/tests/unit/recommenders/datasets/test_download_utils.py b/tests/unit/recommenders/datasets/test_download_utils.py index 305dc1ecb..8de69b891 100644 --- a/tests/unit/recommenders/datasets/test_download_utils.py +++ b/tests/unit/recommenders/datasets/test_download_utils.py @@ -25,7 +25,7 @@ def test_maybe_download(files_fixtures): os.remove(filepath) downloaded_filepath = maybe_download(file_url, "license.txt", expected_bytes=1212) - assert os.path.exists(downloaded_filepath) is True + assert os.path.exists(downloaded_filepath) assert os.path.basename(downloaded_filepath) == "license.txt" @@ -51,7 +51,7 @@ def test_maybe_download_maybe(caplog, files_fixtures): os.remove(filepath) downloaded_filepath = maybe_download(file_url, "license.txt") - assert os.path.exists(downloaded_filepath) is True + assert os.path.exists(downloaded_filepath) maybe_download(file_url, "license.txt") assert "File ." + os.path.sep + "license.txt already downloaded" in caplog.text @@ -69,11 +69,11 @@ def test_maybe_download_retry(caplog): def test_download_path(): # Check that the temporal path is created and deleted with download_path() as path: - assert os.path.isdir(path) is True - assert os.path.isdir(path) is False + assert os.path.isdir(path) + assert not os.path.isdir(path) # Check the behavior when a path is provided tmp_dir = TemporaryDirectory() with download_path(tmp_dir.name) as path: - assert os.path.isdir(path) is True - assert os.path.isdir(path) is False + assert os.path.isdir(path) + assert not os.path.isdir(path) diff --git a/tests/unit/recommenders/evaluation/test_python_evaluation.py b/tests/unit/recommenders/evaluation/test_python_evaluation.py index fc26caf59..8702f40ec 100644 --- a/tests/unit/recommenders/evaluation/test_python_evaluation.py +++ b/tests/unit/recommenders/evaluation/test_python_evaluation.py @@ -117,8 +117,8 @@ def test_merge_rating(rating_true, rating_pred): target_y_pred = np.array([14, 12, 7, 8, 13, 6, 11, 5]) assert y_true.shape == y_pred.shape - assert np.all(y_true == target_y_true) is True - assert np.all(y_pred == target_y_pred) is True + assert np.all(y_true == target_y_true) + assert np.all(y_pred == target_y_pred) def test_merge_ranking(rating_true, rating_pred): diff --git a/tests/unit/recommenders/models/test_vowpal_wabbit.py b/tests/unit/recommenders/models/test_vowpal_wabbit.py index 10a0b212b..0db5bdd70 100644 --- a/tests/unit/recommenders/models/test_vowpal_wabbit.py +++ b/tests/unit/recommenders/models/test_vowpal_wabbit.py @@ -28,10 +28,10 @@ def model(): def test_vw_init_del(): model = VW() tempdir = model.tempdir.name - assert os.path.exists(tempdir) is True + assert os.path.exists(tempdir) del model - assert os.path.exists(tempdir) is False + assert not os.path.exists(tempdir) @pytest.mark.experimental From 9c83202fb745f33fc1f50f5e81bbe02e5e1fc5ac Mon Sep 17 
00:00:00 2001 From: miguelgfierro Date: Wed, 17 Jul 2024 12:43:36 +0200 Subject: [PATCH 25/67] :bug: Signed-off-by: miguelgfierro --- .../recommenders/datasets/test_movielens.py | 6 +++--- .../recommenders/datasets/test_download_utils.py | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/data_validation/recommenders/datasets/test_movielens.py b/tests/data_validation/recommenders/datasets/test_movielens.py index 5af7e9673..12ca20a7c 100644 --- a/tests/data_validation/recommenders/datasets/test_movielens.py +++ b/tests/data_validation/recommenders/datasets/test_movielens.py @@ -117,7 +117,7 @@ def test_download_and_extract_movielens(size, tmp): zip_path = os.path.join(tmp, "ml.zip") download_movielens(size, dest_path=zip_path) assert len(os.listdir(tmp)) == 1 - assert os.path.exists(zip_path) + assert os.path.exists(zip_path) is True rating_path = os.path.join(tmp, "rating.dat") item_path = os.path.join(tmp, "item.dat") @@ -126,8 +126,8 @@ def test_download_and_extract_movielens(size, tmp): ) # Test if raw-zip file, rating file, and item file are cached assert len(os.listdir(tmp)) == 3 - assert os.path.exists(rating_path) - assert os.path.exists(item_path) + assert os.path.exists(rating_path) is True + assert os.path.exists(item_path) is True @pytest.mark.parametrize( diff --git a/tests/unit/recommenders/datasets/test_download_utils.py b/tests/unit/recommenders/datasets/test_download_utils.py index 8de69b891..11309e5cf 100644 --- a/tests/unit/recommenders/datasets/test_download_utils.py +++ b/tests/unit/recommenders/datasets/test_download_utils.py @@ -25,7 +25,7 @@ def test_maybe_download(files_fixtures): os.remove(filepath) downloaded_filepath = maybe_download(file_url, "license.txt", expected_bytes=1212) - assert os.path.exists(downloaded_filepath) + assert os.path.exists(downloaded_filepath) is True assert os.path.basename(downloaded_filepath) == "license.txt" @@ -51,7 +51,7 @@ def test_maybe_download_maybe(caplog, files_fixtures): os.remove(filepath) downloaded_filepath = maybe_download(file_url, "license.txt") - assert os.path.exists(downloaded_filepath) + assert os.path.exists(downloaded_filepath) is True maybe_download(file_url, "license.txt") assert "File ." 
+ os.path.sep + "license.txt already downloaded" in caplog.text @@ -69,11 +69,11 @@ def test_maybe_download_retry(caplog): def test_download_path(): # Check that the temporal path is created and deleted with download_path() as path: - assert os.path.isdir(path) - assert not os.path.isdir(path) + assert os.path.isdir(path) is True + assert os.path.isdir(path) is False # Check the behavior when a path is provided tmp_dir = TemporaryDirectory() with download_path(tmp_dir.name) as path: - assert os.path.isdir(path) - assert not os.path.isdir(path) + assert os.path.isdir(path) is True + assert os.path.isdir(path) is True From b44c0f48ce4839339e717610b80838ada3e6702d Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Wed, 17 Jul 2024 12:46:49 +0200 Subject: [PATCH 26/67] remove comparison with predefined booleans like np.bool Signed-off-by: miguelgfierro --- .../unit/recommenders/models/test_ncf_singlenode.py | 12 ++++++------ .../unit/recommenders/models/test_sar_singlenode.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/unit/recommenders/models/test_ncf_singlenode.py b/tests/unit/recommenders/models/test_ncf_singlenode.py index 9a7cb170e..918bd368d 100644 --- a/tests/unit/recommenders/models/test_ncf_singlenode.py +++ b/tests/unit/recommenders/models/test_ncf_singlenode.py @@ -92,8 +92,8 @@ def test_regular_save_load(model_type, n_users, n_items): Q_ = model.sess.run(model.embedding_mlp_Q) # test load function - assert np.array_equal(P, P_) is True - assert np.array_equal(Q, Q_) is True + assert np.array_equal(P, P_) + assert np.array_equal(Q, Q_) if os.path.exists(ckpt): shutil.rmtree(ckpt) @@ -132,10 +132,10 @@ def test_neumf_save_load(n_users, n_items): P_mlp_ = model.sess.run(model.embedding_mlp_P) Q_mlp_ = model.sess.run(model.embedding_mlp_Q) - assert np.array_equal(P_gmf, P_gmf_) is True - assert np.array_equal(Q_gmf, Q_gmf_) is True - assert np.array_equal(P_mlp, P_mlp_) is True - assert np.array_equal(Q_mlp, Q_mlp_) is True + assert np.array_equal(P_gmf, P_gmf_) + assert np.array_equal(Q_gmf, Q_gmf_) + assert np.array_equal(P_mlp, P_mlp_) + assert np.array_equal(Q_mlp, Q_mlp_) if os.path.exists(ckpt_gmf): shutil.rmtree(ckpt_gmf) diff --git a/tests/unit/recommenders/models/test_sar_singlenode.py b/tests/unit/recommenders/models/test_sar_singlenode.py index cc2611477..d747e30be 100644 --- a/tests/unit/recommenders/models/test_sar_singlenode.py +++ b/tests/unit/recommenders/models/test_sar_singlenode.py @@ -376,7 +376,7 @@ def test_get_normalized_scores(header): assert actual.shape == (2, 7) assert isinstance(actual, np.ndarray) is True - assert np.isclose(expected, np.asarray(actual)).all() is True + assert np.isclose(expected, np.asarray(actual)).all() def test_match_similarity_type_from_json_file(header): From a441cb4bbf4bc6a9f6f20c025ebafd1b3448299d Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Wed, 17 Jul 2024 13:14:39 +0200 Subject: [PATCH 27/67] :bug: Signed-off-by: miguelgfierro --- tests/unit/recommenders/tuning/test_ncf_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/recommenders/tuning/test_ncf_utils.py b/tests/unit/recommenders/tuning/test_ncf_utils.py index 55a3ff086..3f2039bc5 100644 --- a/tests/unit/recommenders/tuning/test_ncf_utils.py +++ b/tests/unit/recommenders/tuning/test_ncf_utils.py @@ -40,4 +40,4 @@ def test_compute_test_results__return_success(mock_model, fake_movielens_df): [mock_metric_func], [mock_metric_func], ) - assert mock_model.predict.is_called is True + assert 
mock_model.predict.is_called From b523e8230194f8d69f0e73e8dc813ee1679d75ae Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Tue, 30 Jul 2024 09:35:42 +0200 Subject: [PATCH 28/67] Added free course on Recommendation systems Signed-off-by: miguelgfierro --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 74805200d..72ee78004 100644 --- a/README.md +++ b/README.md @@ -164,6 +164,7 @@ The nightly build tests are run daily on AzureML. ## References +- **FREE COURSE**: M. González-Fierro, "Recommendation Systems: A Practical Introduction", LinkedIn Learning, 2024. [Available on this link](https://www.linkedin.com/learning/recommendation-systems-a-practical-introduction). - D. Li, J. Lian, L. Zhang, K. Ren, D. Lu, T. Wu, X. Xie, "Recommender Systems: Frontiers and Practices", Springer, Beijing, 2024. [Available on this link](https://www.amazon.com/Recommender-Systems-Frontiers-Practices-Dongsheng/dp/9819989639/). - A. Argyriou, M. González-Fierro, and L. Zhang, "Microsoft Recommenders: Best Practices for Production-Ready Recommendation Systems", *WWW 2020: International World Wide Web Conference Taipei*, 2020. Available online: https://dl.acm.org/doi/abs/10.1145/3366424.3382692 - S. Graham, J.K. Min, T. Wu, "Microsoft recommenders: tools to accelerate developing recommender systems", *RecSys '19: Proceedings of the 13th ACM Conference on Recommender Systems*, 2019. Available online: https://dl.acm.org/doi/10.1145/3298689.3346967 From f6d3e6be7a7da9171eda4bdaad5174a0af0345d5 Mon Sep 17 00:00:00 2001 From: Simon Zhao Date: Wed, 31 Jul 2024 13:38:21 +0800 Subject: [PATCH 29/67] Migrate AML SDK from v1 to v2 (#2134) * Migrate AML SDK from v1 to v2 Signed-off-by: Simon Zhao * Correct MLClient constructor parameter names Signed-off-by: Simon Zhao * Remove unsupported operation begin_start() on AmlCompute Signed-off-by: Simon Zhao * Add label for environments.get() Signed-off-by: Simon Zhao * Remove environment get Signed-off-by: Simon Zhao * Update Signed-off-by: Simon Zhao * Correct experiment and environment names Signed-off-by: Simon Zhao * Correct compute Signed-off-by: Simon Zhao * Create Conda env inside Dockerfile Signed-off-by: Simon Zhao * Catch ResourceNotFoundError Signed-off-by: Simon Zhao * Correct experiment name Signed-off-by: Simon Zhao * Update env creation and job running Signed-off-by: Simon Zhao * Try waiting for completion by stream Signed-off-by: Simon Zhao * Try to fix conda activate Signed-off-by: Simon Zhao * Import sys Signed-off-by: Simon Zhao * Change logging level Signed-off-by: Simon Zhao * Exit directly once pytest fails Signed-off-by: Simon Zhao * Set numpy<2.0.0 due to issue of cornac Signed-off-by: Simon Zhao * Correct Dockerfile Signed-off-by: Simon Zhao * Change heredoc Signed-off-by: Simon Zhao * Set dockerfile version Signed-off-by: Simon Zhao * Copy environment.yml to container Signed-off-by: Simon Zhao * Update .github/actions/azureml-test/action.yml Co-authored-by: Miguel Fierro <3491412+miguelgfierro@users.noreply.github.com> * Update as suggested by Andreas * Print pytest logs Signed-off-by: Simon Zhao * Group outputs Signed-off-by: Simon Zhao * Install pip in conda env, and show pytest warnings Signed-off-by: Simon Zhao * Add command name Signed-off-by: Simon Zhao * Update Signed-off-by: Simon Zhao * Show warnings in pytest Signed-off-by: Simon Zhao * Show warnings in pytest Signed-off-by: Simon Zhao --------- Signed-off-by: Simon Zhao Co-authored-by: Miguel Fierro <3491412+miguelgfierro@users.noreply.github.com> ---
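Reviewer note (below the --- so it stays out of the commit message): the heart of this migration is replacing the v1 Experiment/ScriptRunConfig submission path with MLClient and command() from the azure-ai-ml package, wrapped in the new tests/ci/azureml_tests/aml_utils.py. A minimal sketch of the v2 flow, reusing the CI defaults for the resource group, workspace, cluster, and test script; the subscription ID and environment name below are placeholders, not values used by the workflows:

from azure.ai.ml import MLClient, command
from azure.identity import DefaultAzureCredential

# Bind to the test workspace; RG/WS names are the CI defaults,
# the subscription ID is a placeholder.
client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id="<subscription-id>",
    resource_group_name="recommenders_project_resources",
    workspace_name="azureml-test-workspace",
)

# command() replaces the v1 ScriptRunConfig; a job replaces a v1 run.
job = client.jobs.create_or_update(
    command(
        experiment_name="recommenders-unit-example",  # placeholder name
        compute="cpu-cluster",
        environment="recommenders-env@latest",  # placeholder name
        code="./",
        command=(
            "python tests/ci/azureml_tests/run_groupwise_pytest.py "
            "--expname recommenders-unit-example "
            "--testgroup group_cpu_001 --testkind unit"
        ),
    )
)

# Stream logs and block until the job finishes, then check its status;
# together these replace the v1 run.wait_for_completion().
client.jobs.stream(job.name)
assert client.jobs.get(job.name).status == "Completed"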
.github/actions/azureml-test/action.yml | 117 ++--- .github/actions/get-test-groups/action.yml | 9 +- .github/workflows/azureml-cpu-nightly.yml | 8 +- .github/workflows/azureml-gpu-nightly.yml | 8 +- .github/workflows/azureml-spark-nightly.yml | 8 +- .github/workflows/azureml-unit-tests.yml | 7 +- setup.py | 1 + tests/ci/azureml_tests/aml_utils.py | 198 +++++++ tests/ci/azureml_tests/post_pytest.py | 96 ++++ .../ci/azureml_tests/run_groupwise_pytest.py | 81 +-- .../submit_groupwise_azureml_pytest.py | 494 +++--------------- 11 files changed, 472 insertions(+), 555 deletions(-) create mode 100644 tests/ci/azureml_tests/aml_utils.py create mode 100644 tests/ci/azureml_tests/post_pytest.py diff --git a/.github/actions/azureml-test/action.yml b/.github/actions/azureml-test/action.yml index 85ae9f84a..91a437719 100644 --- a/.github/actions/azureml-test/action.yml +++ b/.github/actions/azureml-test/action.yml @@ -6,64 +6,39 @@ name: azureml-tests description: "Submit experiment to AzureML cluster" inputs: - # azureml experiment name EXP_NAME: required: true - type: string - # type of test - unit or nightly + description: AzureML experiment Name + ENV_NAME: + required: true + description: AzureML environment Name TEST_KIND: required: true - type: string - # test environment - cpu, gpu or spark - TEST_ENV: - required: false - type: string - # azureml compute credentials + description: Type of test - unit or nightly AZUREML_TEST_CREDENTIALS: required: true - type: string - # azureml compute subid + description: Credentials for AzureML login AZUREML_TEST_SUBID: required: true - type: string - # python version + description: AzureML subscription ID PYTHON_VERSION: required: true - type: string - # test group name + description: Python version used for the tests TEST_GROUP: required: true - type: string - # cpu cluster name - CPU_CLUSTER_NAME: - required: false - type: string - default: "cpu-cluster" - # gpu cluster name - GPU_CLUSTER_NAME: - required: false - type: string - default: "gpu-cluster" - # AzureML resource group name + description: Test group defined in test_group.py RG: required: false - type: string + description: AzureML resource group name default: "recommenders_project_resources" - # AzureML workspace name WS: required: false - type: string + description: AzureML workspace name default: "azureml-test-workspace" - # test logs path - TEST_LOGS_PATH: - required: false - type: string - default: '"test_logs.log"' - # pytest exit code - PYTEST_EXIT_CODE: + LOG_DIR: required: false - type: string - default: "pytest_exit_code.log" + description: Directory storing the test logs + default: "test_logs" runs: using: "composite" @@ -71,43 +46,45 @@ runs: - name: Setup python uses: actions/setup-python@v5 with: - python-version: "3.8" - - name: Install azureml-core and azure-cli on a GitHub hosted server + python-version: "3.10" + - name: Install AzureML Python SDK shell: bash - run: pip install --quiet "azureml-core>1,<2" "azure-cli>2,<3" + run: pip install --quiet "azure-ai-ml>1,<2" mlflow "azureml-mlflow>1,<2" - name: Log in to Azure uses: azure/login@v2 with: - creds: ${{inputs.AZUREML_TEST_CREDENTIALS}} - - name: Install wheel package - shell: bash - run: pip install --quiet wheel + creds: ${{ inputs.AZUREML_TEST_CREDENTIALS }} - name: Submit tests to AzureML shell: bash - run: >- + run: | + echo "::group::Running tests ..." 
python tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py \ - --subid ${{inputs.AZUREML_TEST_SUBID}} \ - --reponame "recommenders" \ - --branch ${{ github.ref }} \ - --rg ${{inputs.RG}} \ - --wsname ${{inputs.WS}} \ - --expname ${{inputs.EXP_NAME}}_${{inputs.TEST_GROUP}} \ - --testlogs ${{inputs.TEST_LOGS_PATH}} \ - --testkind ${{inputs.TEST_KIND}} \ - --conda_pkg_python ${{inputs.PYTHON_VERSION}} \ - --testgroup ${{inputs.TEST_GROUP}} \ - --disable-warnings \ - --sha "${GITHUB_SHA}" \ - --clustername $(if [[ ${{inputs.TEST_GROUP}} =~ "gpu" ]]; then echo "${{inputs.GPU_CLUSTER_NAME}}"; else echo "${{inputs.CPU_CLUSTER_NAME}}"; fi) \ - $(if [[ ${{inputs.TEST_GROUP}} =~ "gpu" ]]; then echo "--add_gpu_dependencies"; fi) \ - $(if [[ ${{inputs.TEST_GROUP}} =~ "spark" ]]; then echo "--add_spark_dependencies"; fi) - - name: Get exit status + --subid ${{ inputs.AZUREML_TEST_SUBID }} \ + --rg ${{ inputs.RG }} \ + --ws ${{ inputs.WS }} \ + --cluster ${{ contains(inputs.TEST_GROUP, 'gpu') && 'gpu-cluster' || 'cpu-cluster' }} \ + --expname ${{ inputs.EXP_NAME }} \ + --envname ${{ inputs.ENV_NAME }} \ + --testkind ${{ inputs.TEST_KIND}} \ + --python-version ${{ inputs.PYTHON_VERSION }} \ + --testgroup ${{ inputs.TEST_GROUP }} \ + --sha ${GITHUB_SHA} + echo "::endgroup::" + - name: Post tests + if: ${{ ! cancelled() }} shell: bash - id: exit_status - run: echo "code=$(cat ${{inputs.PYTEST_EXIT_CODE}})" >> $GITHUB_OUTPUT - - name: Check Success/Failure - if: ${{ steps.exit_status.outputs.code != 0 }} - uses: actions/github-script@v7 + run: | + echo "::group::Pytest logs" + python tests/ci/azureml_tests/post_pytest.py \ + --subid ${{ inputs.AZUREML_TEST_SUBID }} \ + --rg ${{ inputs.RG }} \ + --ws ${{ inputs.WS }} \ + --expname ${{ inputs.EXP_NAME }} \ + --log-dir ${{ inputs.LOG_DIR }} + echo "::endgroup::" + - name: Save logs + if: ${{ ! 
cancelled() }} + uses: actions/upload-artifact@v4 with: - script: | - core.setFailed('All tests did not pass!') + name: logs-${{ inputs.TEST_GROUP }}-python${{ inputs.PYTHON_VERSION }} + path: ${{ inputs.LOG_DIR }} diff --git a/.github/actions/get-test-groups/action.yml b/.github/actions/get-test-groups/action.yml index 39364fa81..dc50e4b93 100644 --- a/.github/actions/get-test-groups/action.yml +++ b/.github/actions/get-test-groups/action.yml @@ -6,18 +6,17 @@ name: get-test-groups description: "Get test group names from tests_groups.py" inputs: - # type of test - unit or nightly TEST_KIND: required: true - type: string - # test environment - cpu, gpu or spark + description: Type of test - unit or nightly TEST_ENV: required: false - type: string + description: Test environment - cpu, gpu or spark default: 'cpu' outputs: test_groups: - value: ${{steps.get_test_groups.outputs.test_groups}} + description: A list of test groups + value: ${{ steps.get_test_groups.outputs.test_groups }} runs: using: "composite" diff --git a/.github/workflows/azureml-cpu-nightly.yml b/.github/workflows/azureml-cpu-nightly.yml index 93e414564..89fc64757 100644 --- a/.github/workflows/azureml-cpu-nightly.yml +++ b/.github/workflows/azureml-cpu-nightly.yml @@ -34,7 +34,7 @@ on: # Enable manual trigger workflow_dispatch: - input: + inputs: tags: description: 'Tags to label this manual run (optional)' default: 'Manual trigger' @@ -67,7 +67,7 @@ jobs: strategy: max-parallel: 50 # Usage limits: https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration matrix: - python-version: ['"python=3.8"', '"python=3.9"', '"python=3.10"', '"python=3.11"'] + python-version: ["3.8", "3.9", "3.10", "3.11"] test-group: ${{ fromJSON(needs.get-test-groups.outputs.test_groups) }} steps: - name: Check out repository code @@ -76,9 +76,9 @@ jobs: uses: ./.github/actions/azureml-test id: execute_tests with: - EXP_NAME: 'nightly_tests' + EXP_NAME: recommenders-nightly-${{ matrix.test-group }}-python${{ matrix.python-version }}-${{ github.ref }} + ENV_NAME: recommenders-${{ github.sha }}-python${{ matrix.python-version }}${{ contains(matrix.test-group, 'gpu') && '-gpu' || '' }}${{ contains(matrix.test-group, 'spark') && '-spark' || '' }} TEST_KIND: 'nightly' - TEST_ENV: 'cpu' AZUREML_TEST_CREDENTIALS: ${{ secrets.AZUREML_TEST_CREDENTIALS }} AZUREML_TEST_SUBID: ${{ secrets.AZUREML_TEST_SUBID }} PYTHON_VERSION: ${{ matrix.python-version }} diff --git a/.github/workflows/azureml-gpu-nightly.yml b/.github/workflows/azureml-gpu-nightly.yml index 3b9f6d6b4..16e3e6ed2 100644 --- a/.github/workflows/azureml-gpu-nightly.yml +++ b/.github/workflows/azureml-gpu-nightly.yml @@ -34,7 +34,7 @@ on: # Enable manual trigger workflow_dispatch: - input: + inputs: tags: description: 'Tags to label this manual run (optional)' default: 'Manual trigger' @@ -67,7 +67,7 @@ jobs: strategy: max-parallel: 50 # Usage limits: https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration matrix: - python-version: ['"python=3.8"', '"python=3.9"', '"python=3.10"', '"python=3.11"'] + python-version: ["3.8", "3.9", "3.10", "3.11"] test-group: ${{ fromJSON(needs.get-test-groups.outputs.test_groups) }} steps: - name: Check out repository code @@ -76,9 +76,9 @@ jobs: uses: ./.github/actions/azureml-test id: execute_tests with: - EXP_NAME: 'nightly_tests' + EXP_NAME: recommenders-nightly-${{ matrix.test-group }}-python${{ matrix.python-version }}-${{ github.ref }} + ENV_NAME: recommenders-${{ github.sha 
}}-python${{ matrix.python-version }}${{ contains(matrix.test-group, 'gpu') && '-gpu' || '' }}${{ contains(matrix.test-group, 'spark') && '-spark' || '' }} TEST_KIND: 'nightly' - TEST_ENV: 'gpu' AZUREML_TEST_CREDENTIALS: ${{ secrets.AZUREML_TEST_CREDENTIALS }} AZUREML_TEST_SUBID: ${{ secrets.AZUREML_TEST_SUBID }} PYTHON_VERSION: ${{ matrix.python-version }} diff --git a/.github/workflows/azureml-spark-nightly.yml b/.github/workflows/azureml-spark-nightly.yml index 8f28be6f2..97789fccf 100644 --- a/.github/workflows/azureml-spark-nightly.yml +++ b/.github/workflows/azureml-spark-nightly.yml @@ -33,7 +33,7 @@ on: # Enable manual trigger workflow_dispatch: - input: + inputs: tags: description: 'Tags to label this manual run (optional)' default: 'Manual trigger' @@ -66,7 +66,7 @@ jobs: strategy: max-parallel: 50 # Usage limits: https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration matrix: - python-version: ['"python=3.8"', '"python=3.9"', '"python=3.10"', '"python=3.11"'] + python-version: ["3.8", "3.9", "3.10", "3.11"] test-group: ${{ fromJSON(needs.get-test-groups.outputs.test_groups) }} steps: - name: Check out repository code @@ -75,9 +75,9 @@ jobs: uses: ./.github/actions/azureml-test id: execute_tests with: - EXP_NAME: 'nightly_tests' + EXP_NAME: recommenders-nightly-${{ matrix.test-group }}-python${{ matrix.python-version }}-${{ github.ref }} + ENV_NAME: recommenders-${{ github.sha }}-python${{ matrix.python-version }}${{ contains(matrix.test-group, 'gpu') && '-gpu' || '' }}${{ contains(matrix.test-group, 'spark') && '-spark' || '' }} TEST_KIND: 'nightly' - TEST_ENV: 'spark' AZUREML_TEST_CREDENTIALS: ${{ secrets.AZUREML_TEST_CREDENTIALS }} AZUREML_TEST_SUBID: ${{ secrets.AZUREML_TEST_SUBID }} PYTHON_VERSION: ${{ matrix.python-version }} diff --git a/.github/workflows/azureml-unit-tests.yml b/.github/workflows/azureml-unit-tests.yml index b39268318..ed3b5a98d 100644 --- a/.github/workflows/azureml-unit-tests.yml +++ b/.github/workflows/azureml-unit-tests.yml @@ -23,7 +23,7 @@ on: # Enable manual trigger workflow_dispatch: - input: + inputs: tags: description: 'Tags to label this manual run (optional)' default: 'Manual trigger' @@ -56,7 +56,7 @@ jobs: strategy: max-parallel: 50 # Usage limits: https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration matrix: - python-version: ['"python=3.8"', '"python=3.9"', '"python=3.10"', '"python=3.11"'] + python-version: ["3.8", "3.9", "3.10", "3.11"] test-group: ${{ fromJSON(needs.get-test-groups.outputs.test_groups) }} steps: - name: Check out repository code @@ -65,7 +65,8 @@ uses: ./.github/actions/azureml-test id: execute_tests with: - EXP_NAME: 'unit_tests' + EXP_NAME: recommenders-unit-${{ matrix.test-group }}-python${{ matrix.python-version }}-${{ github.sha }} + ENV_NAME: recommenders-${{ github.sha }}-python${{ matrix.python-version }}${{ contains(matrix.test-group, 'gpu') && '-gpu' || '' }}${{ contains(matrix.test-group, 'spark') && '-spark' || '' }} TEST_KIND: 'unit' AZUREML_TEST_CREDENTIALS: ${{ secrets.AZUREML_TEST_CREDENTIALS }} AZUREML_TEST_SUBID: ${{ secrets.AZUREML_TEST_SUBID }} diff --git a/setup.py b/setup.py index 631d6cd83..03df519ed 100644 --- a/setup.py +++ b/setup.py @@ -36,6 +36,7 @@ "nltk>=3.8.1,<4", # requires tqdm "notebook>=6.5.5,<8", # requires ipykernel, jinja2, jupyter, nbconvert, nbformat, packaging, requests "numba>=0.57.0,<1", + "numpy<2.0.0", # FIXME: Remove numpy<2.0.0 once cornac releases a version newer than 2.2.1 that resolves the
ImportError: numpy.core.multiarray failed to import. "pandas>2.0.0,<3.0.0", # requires numpy "pandera[strategies]>=0.6.5,<0.18;python_version<='3.8'", # For generating fake datasets "pandera[strategies]>=0.15.0;python_version>='3.9'", diff --git a/tests/ci/azureml_tests/aml_utils.py b/tests/ci/azureml_tests/aml_utils.py new file mode 100644 index 000000000..d24ec1361 --- /dev/null +++ b/tests/ci/azureml_tests/aml_utils.py @@ -0,0 +1,198 @@ +# Copyright (c) Recommenders contributors. +# Licensed under the MIT License. + +""" +This module includes utilities for tests on AzureML via AML Python SDK v2. +See +* https://learn.microsoft.com/en-us/azure/machine-learning/concept-v2?view=azureml-api-2 +* https://learn.microsoft.com/en-us/azure/machine-learning/reference-migrate-sdk-v1-mlflow-tracking?view=azureml-api-2&tabs=aml%2Ccli%2Cmlflow +""" +import pathlib +import tempfile + +from azure.ai.ml import MLClient, command +from azure.ai.ml.entities import AmlCompute, BuildContext, Environment, Workspace +from azure.ai.ml.exceptions import JobException +from azure.core.exceptions import ResourceExistsError +from azure.identity import DefaultAzureCredential + +def get_client(subscription_id, resource_group, workspace_name): + """ + Get the client for the specified AzureML workspace, or create the workspace if it does not exist. + See https://github.com/Azure/azureml-examples/blob/main/sdk/python/resources/workspace/workspace.ipynb + """ + params = dict( + credential=DefaultAzureCredential(), + subscription_id=subscription_id, + resource_group_name=resource_group, + ) + client = MLClient(**params) + + workspace = client.workspaces.get(workspace_name) + if workspace is None: + workspace = client.workspaces.begin_create( + Workspace(name=workspace_name) + ).result() + + params["workspace_name"] = workspace_name + client = MLClient(**params) + return client + + +def create_or_start_compute(client, name, size, max_instances): + """ + Get the specified compute, or create it if it does not exist. + See https://github.com/Azure/azureml-examples/blob/main/sdk/python/resources/compute/compute.ipynb + """ + compute = client.compute.get(name) + if compute is None: + compute = client.compute.begin_create_or_update( + AmlCompute( + name=name, + type="amlcompute", + size=size, + max_instances=max_instances, + ) + ).result() + + +def get_or_create_environment( + client, + environment_name, + use_gpu, + use_spark, + conda_pkg_jdk, + python_version, + commit_sha, +): + """ + AzureML requires the run environment to be set up prior to submission. + This builds the Docker image or conda environment that the tests run in.
+ See https://github.com/Azure/azureml-examples/blob/main/sdk/python/assets/environment/environment.ipynb + + Args: + client (MLClient): the client to interact with AzureML services + environment_name (str): Environment name + use_gpu (bool): True if gpu packages should be + added to the conda environment, else False + use_spark (bool): True if PySpark packages should be + added to the conda environment, else False + conda_pkg_jdk (str): "openjdk=8" by default + python_version (str): python version, such as "3.9" + commit_sha (str): the commit that triggers the workflow + """ + conda_env_name = "reco" + conda_env_yml = "environment.yml" + condafile = fr""" +name: {conda_env_name} +channels: + - conda-forge +dependencies: + - python={python_version} + - {conda_pkg_jdk} + - pip + - pip: + - pymanopt@https://github.com/pymanopt/pymanopt/archive/fb36a272cdeecb21992cfd9271eb82baafeb316d.zip + - recommenders[dev{",gpu" if use_gpu else ""}{",spark" if use_spark else ""}]@git+https://github.com/recommenders-team/recommenders.git@{commit_sha} +""" + # See https://github.com/Azure/AzureML-Containers/blob/master/base/cpu/openmpi4.1.0-ubuntu22.04 + image = "mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu22.04" + # See https://github.com/Azure/AzureML-Containers/blob/master/base/gpu/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04 + dockerfile = fr"""# syntax=docker/dockerfile:1 +FROM nvcr.io/nvidia/cuda:12.5.1-devel-ubuntu22.04 +SHELL ["/bin/bash", "-c"] +USER root:root +ENV NVIDIA_VISIBLE_DEVICES all +ENV NVIDIA_DRIVER_CAPABILITIES compute,utility +ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 +ENV DEBIAN_FRONTEND noninteractive +RUN apt-get update && \ + apt-get install -y wget git-all && \ + apt-get clean -y && \ + rm -rf /var/lib/apt/lists/* + +# Install Conda +ENV CONDA_PREFIX /opt/miniconda +RUN wget -qO /tmp/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py311_24.5.0-0-Linux-x86_64.sh && \ + bash /tmp/miniconda.sh -bf -p ${{CONDA_PREFIX}} && \ + ${{CONDA_PREFIX}}/bin/conda update --all -c conda-forge -y && \ + ${{CONDA_PREFIX}}/bin/conda clean -ay && \ + rm -rf ${{CONDA_PREFIX}}/pkgs && \ + rm /tmp/miniconda.sh && \ + find / -type d -name __pycache__ | xargs rm -rf + +# Create Conda environment +COPY {conda_env_yml} /tmp/{conda_env_yml} +RUN ${{CONDA_PREFIX}}/bin/conda env create -f /tmp/{conda_env_yml} + +# Activate Conda environment +ENV CONDA_DEFAULT_ENV {conda_env_name} +ENV CONDA_PREFIX ${{CONDA_PREFIX}}/envs/${{CONDA_DEFAULT_ENV}} +ENV PATH="${{CONDA_PREFIX}}/bin:${{PATH}}" LD_LIBRARY_PATH="${{CONDA_PREFIX}}/lib:$LD_LIBRARY_PATH" +""" + + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir = pathlib.Path(tmpdir) + dockerfile_path = tmpdir / "Dockerfile" + condafile_path = tmpdir / conda_env_yml + build = BuildContext(path=tmpdir, dockerfile_path=dockerfile_path.name) + + with open(dockerfile_path, "w") as file: + file.write(dockerfile) + with open(condafile_path, "w") as file: + file.write(condafile) + + try: + client.environments.create_or_update( + Environment( + name=environment_name, + image=None if use_gpu else image, + build=build if use_gpu else None, + conda_file=None if use_gpu else condafile_path, + ) + ) + except ResourceExistsError: + pass + + +def run_tests( + client, + compute, + environment_name, + experiment_name, + script, + testgroup, + testkind, +): + """ + Pytest on AzureML compute. 
+ See https://github.com/Azure/azureml-examples/blob/main/sdk/python/jobs/single-step/debug-and-monitor/debug-and-monitor.ipynb + """ + job = client.jobs.create_or_update( + command( + experiment_name=experiment_name, + compute=compute, + environment=f"{environment_name}@latest", + code="./", + command=( + f"python {script} " + f"--expname {experiment_name} " + f"--testgroup {testgroup} " + f"--testkind {testkind}" + ), + ) + ) + client.jobs.stream(job.name) + job = client.jobs.get(job.name) + if job.status != "Completed": + raise JobException("Job Not Completed!") + + +def correct_resource_name(resource_name): + """ + Resource name can only contain alphanumeric characters, dashes, and + underscores, with a limit of 255 characters. + """ + name = resource_name.replace(".", "_") + name = name.replace("/", "_") + return name diff --git a/tests/ci/azureml_tests/post_pytest.py b/tests/ci/azureml_tests/post_pytest.py new file mode 100644 index 000000000..b457e709d --- /dev/null +++ b/tests/ci/azureml_tests/post_pytest.py @@ -0,0 +1,96 @@ +# Copyright (c) Recommenders contributors. +# Licensed under the MIT License. + +""" +This Python script completes post test tasks such as downloading logs. +""" + +import argparse +import mlflow +import logging +import pathlib + +from aml_utils import get_client, correct_resource_name + + +def parse_args(): + """ + Parse command line arguments. + """ + + parser = argparse.ArgumentParser(description="Process some inputs") + + parser.add_argument( + "--rg", action="store", + default="recommender", + help="Azure Resource Group", + ) + parser.add_argument( + "--ws", action="store", + default="RecoWS", + help="AzureML workspace name", + ) + parser.add_argument( + "--subid", + action="store", + default="123456", + help="Azure Subscription ID", + ) + parser.add_argument( + "--expname", + action="store", + default="persistentAzureML", + help="Experiment name on AzureML", + ) + parser.add_argument( + "--log-dir", + action="store", + default="test_logs", + help="Test logs will be downloaded to this path", + ) + + return parser.parse_args() + + +if __name__ == "__main__": + logger = logging.getLogger("post_pytest.py") + args = parse_args() + + logger.info(f"Setting up workspace {args.ws}") + client = get_client( + subscription_id=args.subid, + resource_group=args.rg, + workspace_name=args.ws, + ) + + # See https://learn.microsoft.com/en-us/azure/machine-learning/how-to-use-mlflow-configure-tracking?view=azureml-api-2&tabs=python%2Cmlflow#configure-mlflow-tracking-uri + logger.info(f"Configuring mlflow") + mlflow.set_tracking_uri( + client.workspaces.get(client.workspace_name).mlflow_tracking_uri + ) + + # See https://learn.microsoft.com/en-us/azure/machine-learning/how-to-track-experiments-mlflow?view=azureml-api-2 + logger.info(f"Searching runs") + experiment_name = correct_resource_name(args.expname) + runs = mlflow.search_runs( + experiment_names=[experiment_name], + max_results=1, + output_format="list", + ) + if runs: + run = runs[0] + + # See https://www.mlflow.org/docs/latest/python_api/mlflow.artifacts.html#mlflow.artifacts.download_artifacts + # For more details on logs, see + # * https://learn.microsoft.com/en-us/azure/machine-learning/how-to-log-view-metrics?view=azureml-api-2&tabs=interactive#view-and-download-diagnostic-logs + # * https://azure.github.io/azureml-cheatsheets/docs/cheatsheets/python/v1/debugging/ + logger.info(f"Downloading AzureML logs") + mlflow.artifacts.download_artifacts( + run_id=run.info.run_id, + dst_path=args.log_dir, + ) + log_path 
= pathlib.Path("user_logs/std_log.txt") + with open(pathlib.Path(args.log_dir) / log_path, "r") as file: + print(f"\nDumping logs in {log_path}") + print("=====================================") + print(file.read()) diff --git a/tests/ci/azureml_tests/run_groupwise_pytest.py b/tests/ci/azureml_tests/run_groupwise_pytest.py index 92e1ee2bd..8a97fa481 100644 --- a/tests/ci/azureml_tests/run_groupwise_pytest.py +++ b/tests/ci/azureml_tests/run_groupwise_pytest.py @@ -2,90 +2,57 @@ # Licensed under the MIT License. """ -run_pytest.py is the script submitted to Azure ML that runs pytest. +run_groupwise_pytest.py is the script submitted to Azure ML that runs pytest. pytest runs all tests in the specified test folder unless parameters are set otherwise. """ -import sys +import argparse import logging import pytest -import argparse -import glob -import pkg_resources -from azureml.core import Run -from test_groups import nightly_test_groups, pr_gate_test_groups - -if __name__ == "__main__": +import sys - logger = logging.getLogger("submit_groupwise_azureml_pytest.py") - logging.basicConfig(stream=sys.stdout, level=logging.INFO) +from test_groups import nightly_test_groups, pr_gate_test_groups +def parse_args(): + """ + Parse command line arguments. + """ parser = argparse.ArgumentParser(description="Process inputs") + parser.add_argument( + "--expname", + action="store", + default="persistentAzureML", + help="Experiment name on AzureML", + ) parser.add_argument( "--testkind", - "-k", action="store", default="unit", help="Test kind - nightly or unit", ) parser.add_argument( "--testgroup", - "-g", action="store", default="group_cpu_001", help="Group name for the tests", ) - # Flag to indicate whether to turn off the warnings - parser.add_argument( - "--disable-warnings", - action="store_true", - help="Turn off warnings", - ) - args = parser.parse_args() + return parser.parse_args() + +if __name__ == "__main__": + + logger = logging.getLogger("run_groupwise_pytest.py") + + args = parse_args() if args.testkind == "nightly": test_group = nightly_test_groups[args.testgroup] else: test_group = pr_gate_test_groups[args.testgroup] - logger.info(f"Python version: {sys.version}") - - logger.info("Installed packages:") - for p in pkg_resources.working_set: - logger.info(f" {p.project_name}:{p.version}") - - logger.info("Tests to be executed") - logger.info(str(test_group)) - - # Run.get_context() is needed to save context as pytest causes corruption - # of env vars - run = Run.get_context() - - logger.info("Executing tests now...") - - # Add options to pytest command (Duration and disable warnings) + # Add options to pytest command (Duration) pytest_string = test_group + ["--durations"] + ["0"] - if args.disable_warnings is True: - pytest_string += ["--disable-warnings"] # Execute pytest command - pytest_exit_code = pytest.main(pytest_string) - - logger.info("Test execution completed!") - - # log pytest exit code as a metric - # to be used to indicate success/failure in github workflow - run.log("pytest_exit_code", pytest_exit_code.value) - - # # - # # Leveraged code from this notebook: - # # https://msdata.visualstudio.com/Vienna/_search?action=contents&text=upload_folder&type=code&lp=code-Project&filters=ProjectFilters%7BVienna%7DRepositoryFilters%7BAzureMlCli%7D&pageSize=25&sortOptions=%5B%7B%22field%22%3A%22relevance%22%2C%22sortOrder%22%3A%22desc%22%7D%5D&result=DefaultCollection%2FVienna%2FAzureMlCli%2FGBmaster%2F%2Fsrc%2Fazureml-core%2Fazureml%2Fcore%2Frun.py - # logger.info("os.listdir files 
{}".format(os.listdir("."))) - - # upload pytest stdout file - logs_path = ( - glob.glob("**/70_driver_log.txt", recursive=True) - + glob.glob("**/std_log.txt", recursive=True) - )[0] - run.upload_file(name="test_logs", path_or_stream=logs_path) + logger.info("Executing tests now...") + sys.exit(pytest.main(pytest_string)) diff --git a/tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py b/tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py index 761fe8950..4ce6106bf 100644 --- a/tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py +++ b/tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py @@ -4,7 +4,7 @@ """ This python script sets up an environment on AzureML and submits a script to it to run pytest. It is usually intended to be used as -part of a DevOps pipeline which runs testing on a github repo but +part of a DevOps pipeline which runs testing on a GitHub repo but can also be used from command line. Many parameters are set to default values and some are expected to be passed @@ -14,418 +14,110 @@ Args: - Required: - --clustername (str): the Azure cluster for this run. It can already exist - or it will be created. - --subid (str): the Azure subscription id - - Optional but suggested, this info will be stored on Azure as - text information as part of the experiment: - --pr (str): the Github PR number - --reponame (str): the Github repository name - --branch (str): the branch being run - It is also possible to put any text string in these. + See parse_args() below for more details. Example: Usually, this script is run by a DevOps pipeline. It can also be run from cmd line. >>> python tests/ci/submit_groupwise_azureml_pytest.py \ - --clustername 'cluster-d3-v2' \ - --subid '12345678-9012-3456-abcd-123456789012' \ - --pr '666' \ - --reponame 'Recommenders' \ - --branch 'staging' + --subid '12345678-9012-3456-abcd-123456789012' ... """ import argparse import logging -from azureml.core.authentication import AzureCliAuthentication -from azureml.core import Workspace -from azureml.core import Experiment -from azureml.core.runconfig import RunConfiguration, DockerConfiguration -from azureml.core.conda_dependencies import CondaDependencies -from azureml.core.script_run_config import ScriptRunConfig -from azureml.core.compute import ComputeTarget, AmlCompute -from azureml.core.compute_target import ComputeTargetException -from azureml.core.workspace import WorkspaceException - - -def setup_workspace( - workspace_name, subscription_id, resource_group, cli_auth, location -): - """ - This sets up an Azure Workspace. - An existing Azure Workspace is used or a new one is created if needed for - the pytest run. - - Args: - workspace_name (str): Centralized location on Azure to work - with all the artifacts used by AzureML - service - subscription_id (str): the Azure subscription id - resource_group (str): Azure Resource Groups are logical collections of - assets associated with a project. Resource groups - make it easy to track or delete all resources - associated with a project by tracking or deleting - the Resource group. 
- cli_auth Azure authentication - location (str): workspace reference - - Returns: - ws: workspace reference - """ - logger.debug("setup: workspace_name is {}".format(workspace_name)) - logger.debug("setup: resource_group is {}".format(resource_group)) - logger.debug("setup: subid is {}".format(subscription_id)) - logger.debug("setup: location is {}".format(location)) - - try: - # use existing workspace if there is one - ws = Workspace.get( - name=workspace_name, - subscription_id=subscription_id, - resource_group=resource_group, - auth=cli_auth, - ) - except WorkspaceException: - # this call might take a minute or two. - logger.debug("Creating new workspace") - ws = Workspace.create( - name=workspace_name, - subscription_id=subscription_id, - resource_group=resource_group, - # create_resource_group=True, - location=location, - auth=cli_auth, - show_output=False, - ) - return ws - - -def setup_persistent_compute_target(workspace, cluster_name, vm_size, max_nodes): - """ - Set up a persistent compute target on AzureML. - A persistent compute target runs noticeably faster than a - regular compute target for subsequent runs. The benefit - is that AzureML manages turning the compute on/off as needed for - each job so the user does not need to do this. - - Args: - workspace (str): Centralized location on Azure to work with - all the - artifacts used by AzureML service - cluster_name (str): the Azure cluster for this run. It can - already exist or it will be created. - vm_size (str): Azure VM size, like STANDARD_D3_V2 - max_nodes (int): Number of VMs, max_nodes=4 will - autoscale up to 4 VMs - Returns: - cpu_cluster : cluster reference - """ - # setting vmsize and num nodes creates a persistent AzureML - # compute resource - - logger.debug("setup: cluster_name {}".format(cluster_name)) - # https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets - - try: - cpu_cluster = ComputeTarget(workspace=workspace, name=cluster_name) - logger.debug("setup: Found existing cluster, use it.") - except ComputeTargetException: - logger.debug("setup: create cluster") - compute_config = AmlCompute.provisioning_configuration( - vm_size=vm_size, - max_nodes=max_nodes, - ssh_public_access_enabled=True, - idle_time_before_scale_down=3600, # 1 hour - ) - cpu_cluster = ComputeTarget.create(workspace, cluster_name, compute_config) - cpu_cluster.wait_for_completion(show_output=False) - return cpu_cluster - - -def create_run_config( - cpu_cluster, - add_gpu_dependencies, - add_spark_dependencies, - conda_pkg_jdk, - conda_pkg_python, - commit_sha, -): - """ - AzureML requires the run environment to be setup prior to submission. - This configures a docker persistent compute. Even though - it is called Persistent compute, AzureML handles startup/shutdown - of the compute environment. 
- - Args: - cpu_cluster (str) : Names the cluster for the test - In the case of unit tests, any of - the following: - - Reco_cpu_test - - Reco_gpu_test - add_gpu_dependencies (bool) : True if gpu packages should be - added to the conda environment, else False - add_spark_dependencies (bool) : True if PySpark packages should be - added to the conda environment, else False - commit_sha (str) : the commit that triggers the workflow - - Return: - run_azuremlcompute : AzureML run config - """ - - run_azuremlcompute = RunConfiguration() - run_azuremlcompute.target = cpu_cluster - if not add_gpu_dependencies: - # https://github.com/Azure/AzureML-Containers/blob/master/base/cpu/openmpi4.1.0-ubuntu22.04 - run_azuremlcompute.environment.docker.base_image = "mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu22.04" - else: - run_azuremlcompute.environment.docker.base_image = None - # Use the latest CUDA - # See - # * https://learn.microsoft.com/en-us/azure/machine-learning/how-to-train-with-custom-image?view=azureml-api-1#use-a-custom-dockerfile-optional - # * https://github.com/Azure/AzureML-Containers/blob/master/base/gpu/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04 - run_azuremlcompute.environment.docker.base_dockerfile = r""" -FROM nvcr.io/nvidia/cuda:12.3.1-devel-ubuntu22.04 -USER root:root -ENV NVIDIA_VISIBLE_DEVICES all -ENV NVIDIA_DRIVER_CAPABILITIES compute,utility -ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 -ENV DEBIAN_FRONTEND noninteractive -RUN apt-get update && \ - apt-get install -y wget git-all && \ - apt-get clean -y && \ - rm -rf /var/lib/apt/lists/* -# Conda Environment -# Pin pip=20.1.1 due to the issue: No module named 'ruamel' -# See https://learn.microsoft.com/en-us/python/api/overview/azure/ml/install?view=azure-ml-py#troubleshooting -ENV MINICONDA_VERSION py38_23.3.1-0 -ENV PATH /opt/miniconda/bin:$PATH -ENV CONDA_PACKAGE 23.5.0 -RUN wget -qO /tmp/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-${MINICONDA_VERSION}-Linux-x86_64.sh && \ - bash /tmp/miniconda.sh -bf -p /opt/miniconda && \ - conda install -y conda=${CONDA_PACKAGE} python=3.8 pip=20.1.1 && \ - conda update --all -c conda-forge -y && \ - conda clean -ay && \ - rm -rf /opt/miniconda/pkgs && \ - rm /tmp/miniconda.sh && \ - find / -type d -name __pycache__ | xargs rm -rf -""" - - # Use conda_dependencies.yml to create a conda environment in - # the Docker image for execution - # False means the user will provide a conda file for setup - # True means the user will manually configure the environment - run_azuremlcompute.environment.python.user_managed_dependencies = False - - conda_dep = CondaDependencies() - conda_dep.add_conda_package(conda_pkg_python) - conda_dep.add_pip_package( - "pymanopt@https://github.com/pymanopt/pymanopt/archive/fb36a272cdeecb21992cfd9271eb82baafeb316d.zip" - ) - - # install recommenders - reco_extras = "dev" - conda_dep.add_conda_package("anaconda::git") - if add_gpu_dependencies and add_spark_dependencies: - conda_dep.add_channel("conda-forge") - conda_dep.add_conda_package(conda_pkg_jdk) - reco_extras = reco_extras + ",spark,gpu" - elif add_gpu_dependencies: - reco_extras = reco_extras + ",gpu" - elif add_spark_dependencies: - conda_dep.add_channel("conda-forge") - conda_dep.add_conda_package(conda_pkg_jdk) - reco_extras = reco_extras + ",spark" - - conda_dep.add_pip_package( - f"recommenders[{reco_extras}]@git+https://github.com/recommenders-team/recommenders.git@{commit_sha}" - ) - - run_azuremlcompute.environment.python.conda_dependencies = conda_dep - return run_azuremlcompute - - -def 
create_experiment(workspace, experiment_name): - """ - AzureML requires an experiment as a container of trials. - This will either create a new experiment or use an - existing one. - - Args: - workspace (str) : name of AzureML workspace - experiment_name (str) : AzureML experiment name - Return: - exp - AzureML experiment - """ - - logger.debug("create: experiment_name {}".format(experiment_name)) - exp = Experiment(workspace=workspace, name=experiment_name) - return exp - - -def submit_experiment_to_azureml( - test, run_config, experiment, test_group, test_kind, warnings -): - - """ - Submitting the experiment to AzureML actually runs the script. - - Args: - test (str): Pytest script, folder/test such as ./tests/ci/run_pytest.py - run_config (obj): Environment configuration - experiment (obj): Instance of an Experiment, a collection of - trials where each trial is a run. - test_group (str): Name of the test group. - test_kind (str): Name of the test kind, such as nightly or unit. - pytestargs (str): Pytest arguments. - - Return: - obj: AzureML run or trial - """ - - arguments = ["--testgroup", test_group, "--testkind", test_kind] - if warnings is True: - arguments.append("--disable-warnings") - - script_run_config = ScriptRunConfig( - source_directory=".", - script=test, - run_config=run_config, - docker_runtime_config=DockerConfiguration(use_docker=True), - arguments=arguments, - ) - - run = experiment.submit(script_run_config) - # waits only for configuration to complete - run.wait_for_completion(show_output=True, wait_post_processing=True) - - # test logs can also be found on azure - # go to azure portal to see log in azure ws and look for experiment name - # and look for individual run - logger.debug("files {}".format(run.get_file_names)) - - return run +from aml_utils import ( + correct_resource_name, + create_or_start_compute, + get_client, + get_or_create_environment, + run_tests, +) -def create_arg_parser(): +def parse_args(): """ Many of the argument defaults are used as arg_parser makes it easy to use defaults. The user has many options they can select. 
""" parser = argparse.ArgumentParser(description="Process some inputs") + parser.add_argument( "--sha", action="store", - help="the commit that triggers the workflow", + help="the commit triggering the workflow", ) - # script to run pytest parser.add_argument( - "--test", + "--script", action="store", default="tests/ci/azureml_tests/run_groupwise_pytest.py", - help="location of script to run pytest", + help="Path of script to run pytest", ) - # max num nodes in Azure cluster parser.add_argument( "--maxnodes", action="store", default=4, - help="specify the maximum number of nodes for the run", + help="Maximum number of nodes for the run", ) - # Test group parser.add_argument( - "--testgroup", action="store", default="group_criteo", help="Test Group" + "--testgroup", + action="store", + default="group_criteo", + help="Test Group", ) - # Azure resource group parser.add_argument( - "--rg", action="store", default="recommender", help="Azure Resource Group" + "--rg", + action="store", + default="recommender", + help="Azure Resource Group", ) - # AzureML workspace Name parser.add_argument( - "--wsname", action="store", default="RecoWS", help="AzureML workspace name" + "--ws", + action="store", + default="RecoWS", + help="AzureML workspace name", ) - # AzureML clustername parser.add_argument( - "--clustername", + "--cluster", action="store", default="azuremlcompute", - help="Set name of Azure cluster", + help="AzureML cluster name", ) - # Azure VM size parser.add_argument( "--vmsize", action="store", default="STANDARD_D3_V2", - help="Set the size of the VM either STANDARD_D3_V2", + help="VM size", ) - # Azure subscription id, when used in a pipeline, it is stored in keyvault parser.add_argument( - "--subid", action="store", default="123456", help="Azure Subscription ID" + "--subid", + action="store", + default="123456", + help="Azure Subscription ID", ) - # AzureML experiment name parser.add_argument( "--expname", action="store", default="persistentAzureML", - help="experiment name on Azure", + help="Experiment name on AzureML", ) - # Azure datacenter location - parser.add_argument("--location", default="EastUS", help="Azure location") - # github repo, stored in AzureML experiment for info purposes parser.add_argument( - "--reponame", + "--envname", action="store", - default="--reponame MyGithubRepo", - help="GitHub repo being tested", + default="recommenders", + help="Environment name on AzureML", ) - # github branch, stored in AzureML experiment for info purposes - parser.add_argument( - "--branch", - action="store", - default="--branch MyGithubBranch", - help=" Identify the branch test test is run on", - ) - # github pull request, stored in AzureML experiment for info purposes - parser.add_argument( - "--pr", - action="store", - default="--pr PRTestRun", - help="If a pr triggered the test, list it here", - ) - # flag to indicate whether gpu dependencies should be included in conda env - parser.add_argument( - "--add_gpu_dependencies", - action="store_true", - help="include packages for GPU support", - ) - # flag to indicate whether pyspark dependencies should be included in conda env - parser.add_argument( - "--add_spark_dependencies", - action="store_true", - help="include packages for PySpark support", - ) - # path where test logs should be downloaded - parser.add_argument( - "--testlogs", - action="store", - default="test_logs.log", - help="Test logs will be downloaded to this path", - ) - # conda package name for jdk parser.add_argument( "--conda_pkg_jdk", action="store", 
default="openjdk=8", - help="conda package name for jdk", + help="Conda package for JDK", ) - # conda package name for python parser.add_argument( - "--conda_pkg_python", + "--python-version", action="store", - default="python=3.7", - help="conda package for Python", + default="3.8", + help="Python version", ) parser.add_argument( "--testkind", @@ -433,73 +125,59 @@ def create_arg_parser(): default="unit", help="Test kind - nightly or unit", ) - # Flag to indicate whether to turn off the warnings - parser.add_argument( - "--disable-warnings", - action="store_true", - help="Turn off warnings", - ) - args = parser.parse_args() - return args + return parser.parse_args() if __name__ == "__main__": - logger = logging.getLogger("submit_groupwise_azureml_pytest.py") - args = create_arg_parser() - cli_auth = AzureCliAuthentication() + args = parse_args() - workspace = setup_workspace( - workspace_name=args.wsname, + logger.info(f"Setting up workspace {args.ws}") + client = get_client( subscription_id=args.subid, resource_group=args.rg, - cli_auth=cli_auth, - location=args.location, - ) - - cpu_cluster = setup_persistent_compute_target( - workspace=workspace, - cluster_name=args.clustername, - vm_size=args.vmsize, - max_nodes=args.maxnodes, - ) - - run_config = create_run_config( - cpu_cluster=cpu_cluster, - add_gpu_dependencies=args.add_gpu_dependencies, - add_spark_dependencies=args.add_spark_dependencies, + workspace_name=args.ws, + ) + + logger.info(f"Setting up compute {args.cluster}") + create_or_start_compute( + client=client, + name=args.cluster, + size=args.vmsize, + max_instances=args.maxnodes + ) + + # TODO: Unlike Azure DevOps pipelines, GitHub Actions only has simple + # string functions like startsWith() and contains(). And AzureML + # only accepts simple names that do not contain '.' and '/'. + # correct_resource_name() is used to replace '.' and '/' with '_' + # which makes names in the workflow and on AzureML inconsistent. 
+ # For example, a name + # * in the workflow + # recommenders-unit-group_cpu_001-python3.8-c8adeafabc011b549f875dc145313ffbe3fc53a8 + # * on AzureML + # recommenders-unit-group_cpu_001-python3_8-c8adeafabc011b549f875dc145313ffbe3fc53a8 + environment_name = correct_resource_name(args.envname) + logger.info(f"Setting up environment {environment_name}") + get_or_create_environment( + client=client, + environment_name=environment_name, + use_gpu=True if "gpu" in args.testgroup else False, + use_spark=True if "spark" in args.testgroup else False, conda_pkg_jdk=args.conda_pkg_jdk, - conda_pkg_python=args.conda_pkg_python, + python_version=args.python_version, commit_sha=args.sha, ) - logger.info("exp: In Azure, look for experiment named {}".format(args.expname)) - - # create new or use existing experiment - experiment = Experiment(workspace=workspace, name=args.expname) - run = submit_experiment_to_azureml( - test=args.test, - run_config=run_config, - experiment=experiment, - test_group=args.testgroup, - test_kind=args.testkind, - warnings=args.disable_warnings, + experiment_name = correct_resource_name(args.expname) + logger.info(f"Running experiment {experiment_name}") + run_tests( + client=client, + compute=args.cluster, + environment_name=environment_name, + experiment_name=experiment_name, + script=args.script, + testgroup=args.testgroup, + testkind=args.testkind, ) - - # add helpful information to experiment on Azure - run.tag("Python", args.conda_pkg_python) - run.tag("RepoName", args.reponame) - run.tag("Branch", args.branch) - run.tag("PR", args.pr) - run.tag("script", args.test) - run.tag("testgroup", args.testgroup) - run.tag("testkind", args.testkind) - - # download logs file from AzureML - run.download_file(name="test_logs", output_file_path=args.testlogs) - - # save pytest exit code - metrics = run.get_metrics() - with open("pytest_exit_code.log", "w") as f: - f.write(str(metrics.get("pytest_exit_code"))) From b6f1031d9233a6841a9e05f78ef1a1592c82923b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20Dav=C3=B3?= Date: Fri, 9 Aug 2024 08:37:33 +0000 Subject: [PATCH 30/67] Moved pymanopt tests to experimental test group #2138 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: David Davó --- tests/ci/azureml_tests/test_groups.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/ci/azureml_tests/test_groups.py b/tests/ci/azureml_tests/test_groups.py index aa2d78b6e..401603ad0 100644 --- a/tests/ci/azureml_tests/test_groups.py +++ b/tests/ci/azureml_tests/test_groups.py @@ -217,7 +217,6 @@ "tests/unit/recommenders/evaluation/test_python_evaluation.py::test_user_item_serendipity_item_feature_vector", "tests/unit/recommenders/evaluation/test_python_evaluation.py::test_user_serendipity_item_feature_vector", "tests/unit/recommenders/evaluation/test_python_evaluation.py::test_serendipity_item_feature_vector", - "tests/unit/recommenders/models/test_geoimc.py::test_imcproblem", "tests/unit/recommenders/models/test_tfidf_utils.py::test_init", "tests/unit/recommenders/models/test_tfidf_utils.py::test_clean_dataframe", "tests/unit/recommenders/models/test_tfidf_utils.py::test_fit", @@ -228,13 +227,6 @@ "tests/unit/recommenders/models/test_tfidf_utils.py::test_get_top_k_recommendations", "tests/unit/recommenders/models/test_cornac_utils.py::test_predict", "tests/unit/recommenders/models/test_cornac_utils.py::test_recommend_k_items", - "tests/unit/recommenders/models/test_geoimc.py::test_dataptr", - 
"tests/unit/recommenders/models/test_geoimc.py::test_length_normalize", - "tests/unit/recommenders/models/test_geoimc.py::test_mean_center", - "tests/unit/recommenders/models/test_geoimc.py::test_reduce_dims", - "tests/unit/recommenders/models/test_geoimc.py::test_imcproblem", - "tests/unit/recommenders/models/test_geoimc.py::test_inferer_init", - "tests/unit/recommenders/models/test_geoimc.py::test_inferer_infer", "tests/unit/recommenders/models/test_sar_singlenode.py::test_init", "tests/unit/recommenders/models/test_sar_singlenode.py::test_fit", "tests/unit/recommenders/models/test_sar_singlenode.py::test_predict", @@ -307,7 +299,6 @@ "tests/integration/recommenders/utils/test_k8s_utils.py", ], "group_notebooks_cpu_001": [ # Total group time: 226.42s - "tests/unit/examples/test_notebooks_python.py::test_rlrmc_quickstart_runs", "tests/unit/examples/test_notebooks_python.py::test_sar_deep_dive_runs", "tests/unit/examples/test_notebooks_python.py::test_baseline_deep_dive_runs", "tests/unit/examples/test_notebooks_python.py::test_template_runs", @@ -456,5 +447,14 @@ "tests/unit/recommenders/models/test_lightfm_utils.py::test_sim_users", "tests/unit/recommenders/models/test_lightfm_utils.py::test_sim_items", "tests/functional/examples/test_notebooks_python.py::test_lightfm_functional", + "tests/unit/recommenders/models/test_geoimc.py::test_imcproblem", + "tests/unit/recommenders/models/test_geoimc.py::test_dataptr", + "tests/unit/recommenders/models/test_geoimc.py::test_length_normalize", + "tests/unit/recommenders/models/test_geoimc.py::test_mean_center", + "tests/unit/recommenders/models/test_geoimc.py::test_reduce_dims", + "tests/unit/recommenders/models/test_geoimc.py::test_imcproblem", + "tests/unit/recommenders/models/test_geoimc.py::test_inferer_init", + "tests/unit/recommenders/models/test_geoimc.py::test_inferer_infer", + "tests/unit/examples/test_notebooks_python.py::test_rlrmc_quickstart_runs", ] } From 220f341216bde44ad853bf233d995b7b9fb97614 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20Dav=C3=B3?= Date: Fri, 9 Aug 2024 08:40:56 +0000 Subject: [PATCH 31/67] Removed pymanopt install from azure tests Its only needed for the experimental tests groups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: David Davó --- tests/ci/azureml_tests/aml_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/ci/azureml_tests/aml_utils.py b/tests/ci/azureml_tests/aml_utils.py index d24ec1361..5a4d488e3 100644 --- a/tests/ci/azureml_tests/aml_utils.py +++ b/tests/ci/azureml_tests/aml_utils.py @@ -92,7 +92,6 @@ def get_or_create_environment( - {conda_pkg_jdk} - pip - pip: - - pymanopt@https://github.com/pymanopt/pymanopt/archive/fb36a272cdeecb21992cfd9271eb82baafeb316d.zip - recommenders[dev{",gpu" if use_gpu else ""}{",spark" if use_spark else ""}]@git+https://github.com/recommenders-team/recommenders.git@{commit_sha} """ # See https://github.com/Azure/AzureML-Containers/blob/master/base/cpu/openmpi4.1.0-ubuntu22.04 From 841d8ef6d81012ed0c0af66d00ab10cacda95897 Mon Sep 17 00:00:00 2001 From: siyerp <112425790+siyerp@users.noreply.github.com> Date: Mon, 12 Aug 2024 14:58:12 -0400 Subject: [PATCH 32/67] Signed-off-by: Subramanian Iyer Changed a function call name to reflect a more recent version of tensorflow. 
--- recommenders/models/sasrec/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recommenders/models/sasrec/model.py b/recommenders/models/sasrec/model.py index 4ac6fa93d..413a4cf0e 100644 --- a/recommenders/models/sasrec/model.py +++ b/recommenders/models/sasrec/model.py @@ -689,7 +689,7 @@ def train_step(inp, tar): for epoch in range(1, num_epochs + 1): step_loss = [] - train_loss.reset_states() + train_loss.reset_state() for step in tqdm( range(num_steps), total=num_steps, ncols=70, leave=False, unit="b" ): From 852956eb8f0739a96230c15bb3a5beb1b104e6fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20Dav=C3=B3?= Date: Tue, 13 Aug 2024 14:25:10 +0000 Subject: [PATCH 33/67] Marked lightfm tests as experimental MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: David Davó --- .../recommenders/models/test_lightfm_utils.py | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/tests/unit/recommenders/models/test_lightfm_utils.py b/tests/unit/recommenders/models/test_lightfm_utils.py index 2155fb655..62b9d2ccc 100644 --- a/tests/unit/recommenders/models/test_lightfm_utils.py +++ b/tests/unit/recommenders/models/test_lightfm_utils.py @@ -6,14 +6,17 @@ import itertools import numpy as np import pandas as pd -from lightfm.data import Dataset -from lightfm import LightFM, cross_validation -from recommenders.models.lightfm.lightfm_utils import ( - track_model_metrics, - similar_users, - similar_items, -) +try: + from lightfm.data import Dataset + from lightfm import LightFM, cross_validation + from recommenders.models.lightfm.lightfm_utils import ( + track_model_metrics, + similar_users, + similar_items, + ) +except ModuleNotFoundError: + pass SEEDNO = 42 @@ -128,6 +131,7 @@ def sim_items(interactions, fitting): ) +@pytest.mark.experimental def test_interactions(interactions): train_interactions, test_interactions, item_features, user_features = interactions assert train_interactions.shape == (10, 10) @@ -136,6 +140,7 @@ def test_interactions(interactions): assert user_features.shape == (10, 17) +@pytest.mark.experimental @pytest.mark.skip(reason="Flaky test") def test_fitting(fitting): output, _ = fitting @@ -152,9 +157,11 @@ def test_fitting(fitting): np.testing.assert_array_equal(output, target) +@pytest.mark.experimental def test_sim_users(sim_users): assert sim_users.shape == (5, 2) +@pytest.mark.experimental def test_sim_items(sim_items): assert sim_items.shape == (5, 2) From 5253623b93453dfae113904ab26e5bf26de88b4d Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Tue, 13 Aug 2024 19:31:55 +0200 Subject: [PATCH 34/67] Update MIND URL Signed-off-by: miguelgfierro --- recommenders/datasets/mind.py | 8 ++++---- recommenders/models/newsrec/newsrec_utils.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/recommenders/datasets/mind.py b/recommenders/datasets/mind.py index f396044c6..23b7a8db2 100644 --- a/recommenders/datasets/mind.py +++ b/recommenders/datasets/mind.py @@ -18,16 +18,16 @@ URL_MIND_LARGE_TRAIN = ( - "https://mind201910small.blob.core.windows.net/release/MINDlarge_train.zip" + "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_train.zip" ) URL_MIND_LARGE_VALID = ( - "https://mind201910small.blob.core.windows.net/release/MINDlarge_dev.zip" + "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_dev.zip" ) URL_MIND_SMALL_TRAIN = ( - "https://mind201910small.blob.core.windows.net/release/MINDsmall_train.zip" + 
"https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_train.zip" ) URL_MIND_SMALL_VALID = ( - "https://mind201910small.blob.core.windows.net/release/MINDsmall_dev.zip" + "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_dev.zip" ) URL_MIND_DEMO_TRAIN = ( "https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_train.zip" diff --git a/recommenders/models/newsrec/newsrec_utils.py b/recommenders/models/newsrec/newsrec_utils.py index 429f24b83..05deeef5d 100644 --- a/recommenders/models/newsrec/newsrec_utils.py +++ b/recommenders/models/newsrec/newsrec_utils.py @@ -310,7 +310,7 @@ def get_mind_data_set(type): if type == "large": return ( - "https://mind201910small.blob.core.windows.net/release/", + "https://recodatasets.z20.web.core.windows.net/newsrec", "MINDlarge_train.zip", "MINDlarge_dev.zip", "MINDlarge_utils.zip", @@ -318,7 +318,7 @@ def get_mind_data_set(type): elif type == "small": return ( - "https://mind201910small.blob.core.windows.net/release/", + "https://recodatasets.z20.web.core.windows.net/newsrec/", "MINDsmall_train.zip", "MINDsmall_dev.zip", "MINDsmall_utils.zip", From 7179a8f60eeb37a20ccbbd934d72cd6f55369328 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Tue, 13 Aug 2024 19:34:37 +0200 Subject: [PATCH 35/67] :bug: Signed-off-by: miguelgfierro --- .../recommenders/datasets/test_mind.py | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/tests/data_validation/recommenders/datasets/test_mind.py b/tests/data_validation/recommenders/datasets/test_mind.py index d4f5f8c1f..c55d8c6f2 100644 --- a/tests/data_validation/recommenders/datasets/test_mind.py +++ b/tests/data_validation/recommenders/datasets/test_mind.py @@ -27,32 +27,33 @@ '"0x8D8B8AD5B126C3B"', ), ( - "https://mind201910small.blob.core.windows.net/release/MINDsmall_train.zip", + "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_train.zip", "52952752", "0x8D834F2EB31BDEC", ), ( - "https://mind201910small.blob.core.windows.net/release/MINDsmall_dev.zip", + "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_dev.zip", "30945572", "0x8D834F2EBA8D865", ), ( - "https://mind201910small.blob.core.windows.net/release/MINDsmall_utils.zip", + "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_utils.zip", "155178106", "0x8D87F67F4AEB960", ), + # FIXME: Issue #2133 + # ( + # "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_train.zip", + # "530196631", + # "0x8D8244E90C15C07", + # ), + # ( + # "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_dev.zip", + # "103456245", + # "0x8D8244E92005849", + # ), ( - "https://mind201910small.blob.core.windows.net/release/MINDlarge_train.zip", - "530196631", - "0x8D8244E90C15C07", - ), - ( - "https://mind201910small.blob.core.windows.net/release/MINDlarge_dev.zip", - "103456245", - "0x8D8244E92005849", - ), - ( - "https://mind201910small.blob.core.windows.net/release/MINDlarge_utils.zip", + "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_utils.zip", "150359301", "0x8D87F67E6CA4364", ), From 98915e18efe264cb93c39871db48521cee362a8c Mon Sep 17 00:00:00 2001 From: siyerp <112425790+siyerp@users.noreply.github.com> Date: Wed, 14 Aug 2024 21:05:05 -0400 Subject: [PATCH 36/67] specified keyword arguments as per updated requirements - subramanian iyer --- recommenders/models/sasrec/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recommenders/models/sasrec/model.py b/recommenders/models/sasrec/model.py index 
4ac6fa93d..26d3496ef 100644 --- a/recommenders/models/sasrec/model.py +++ b/recommenders/models/sasrec/model.py @@ -313,7 +313,7 @@ def call(self, x, training, mask): """ for i in range(self.num_layers): - x = self.enc_layers[i](x, training, mask) + x = self.enc_layers[i](x, training=training, mask=mask) return x # (batch_size, input_seq_len, d_model) From 0296e95635e5807a871cbc5752755257c7396d83 Mon Sep 17 00:00:00 2001 From: siyerp <112425790+siyerp@users.noreply.github.com> Date: Wed, 14 Aug 2024 21:11:58 -0400 Subject: [PATCH 37/67] Update model.py Corrected docstring --- recommenders/models/sasrec/model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/recommenders/models/sasrec/model.py b/recommenders/models/sasrec/model.py index 26d3496ef..1550cfd1f 100644 --- a/recommenders/models/sasrec/model.py +++ b/recommenders/models/sasrec/model.py @@ -240,7 +240,7 @@ def call(self, x, training, mask): Args: x (tf.Tensor): Input tensor. - training (tf.Tensor): Training tensor. + training (Boolean): True if in training mode. mask (tf.Tensor): Mask tensor. Returns: @@ -305,7 +305,7 @@ def call(self, x, training, mask): Args: x (tf.Tensor): Input tensor. - training (tf.Tensor): Training tensor. + training (Boolean): True if in training mode. mask (tf.Tensor): Mask tensor. Returns: From 551ec31b8e3d2a163e72ad8570c0323bc865907f Mon Sep 17 00:00:00 2001 From: siyerp <112425790+siyerp@users.noreply.github.com> Date: Wed, 14 Aug 2024 21:30:25 -0400 Subject: [PATCH 38/67] Changed function signatures to comply with updated tensorflow requirements. -Subramanian Iyer --- recommenders/models/sasrec/ssept.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/recommenders/models/sasrec/ssept.py b/recommenders/models/sasrec/ssept.py index dbf7abdce..15da43082 100644 --- a/recommenders/models/sasrec/ssept.py +++ b/recommenders/models/sasrec/ssept.py @@ -122,7 +122,7 @@ def call(self, x, training): # --- ATTENTION BLOCKS --- seq_attention = seq_embeddings # (b, s, h1 + h2) - seq_attention = self.encoder(seq_attention, training, mask) + seq_attention = self.encoder(seq_attention, training=training, mask=mask) seq_attention = self.layer_normalization(seq_attention) # (b, s, h1+h2) # --- PREDICTION LAYER --- @@ -197,7 +197,7 @@ def predict(self, inputs): seq_embeddings *= mask seq_attention = seq_embeddings - seq_attention = self.encoder(seq_attention, training, mask) + seq_attention = self.encoder(seq_attention, training=training, mask=mask) seq_attention = self.layer_normalization(seq_attention) # (b, s, h1+h2) seq_emb = tf.reshape( seq_attention, From 40dc7c8945a5614e7c99a89e2dab077f1965b412 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 16 Aug 2024 17:42:12 +0200 Subject: [PATCH 39/67] MIND large Signed-off-by: miguelgfierro --- .../recommenders/datasets/test_mind.py | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/tests/data_validation/recommenders/datasets/test_mind.py b/tests/data_validation/recommenders/datasets/test_mind.py index c55d8c6f2..558801ca4 100644 --- a/tests/data_validation/recommenders/datasets/test_mind.py +++ b/tests/data_validation/recommenders/datasets/test_mind.py @@ -41,17 +41,16 @@ "155178106", "0x8D87F67F4AEB960", ), - # FIXME: Issue #2133 - # ( - # "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_train.zip", - # "530196631", - # "0x8D8244E90C15C07", - # ), - # ( - # "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_dev.zip", - # "103456245", - # 
"0x8D8244E92005849", - # ), + ( + "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_train.zip", + "530196631", + "0x8D8244E90C15C07", + ), + ( + "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_dev.zip", + "103456245", + "0x8D8244E92005849", + ), ( "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_utils.zip", "150359301", From cde942e52b2974f52090a28910d2c141793b5d42 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 16 Aug 2024 19:03:13 +0200 Subject: [PATCH 40/67] :bug: Signed-off-by: miguelgfierro --- recommenders/models/newsrec/newsrec_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recommenders/models/newsrec/newsrec_utils.py b/recommenders/models/newsrec/newsrec_utils.py index 05deeef5d..48e1ce8f3 100644 --- a/recommenders/models/newsrec/newsrec_utils.py +++ b/recommenders/models/newsrec/newsrec_utils.py @@ -310,7 +310,7 @@ def get_mind_data_set(type): if type == "large": return ( - "https://recodatasets.z20.web.core.windows.net/newsrec", + "https://recodatasets.z20.web.core.windows.net/newsrec/", "MINDlarge_train.zip", "MINDlarge_dev.zip", "MINDlarge_utils.zip", From 608f6848c7808f67e3e937cabf92aff6ec798b6e Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Tue, 20 Aug 2024 08:28:10 +0200 Subject: [PATCH 41/67] Updates etags and size Signed-off-by: miguelgfierro --- .../recommenders/datasets/test_mind.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/data_validation/recommenders/datasets/test_mind.py b/tests/data_validation/recommenders/datasets/test_mind.py index 558801ca4..201b59d6a 100644 --- a/tests/data_validation/recommenders/datasets/test_mind.py +++ b/tests/data_validation/recommenders/datasets/test_mind.py @@ -28,12 +28,12 @@ ), ( "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_train.zip", - "52952752", + "52953372", "0x8D834F2EB31BDEC", ), ( "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_dev.zip", - "30945572", + "30946172", "0x8D834F2EBA8D865", ), ( @@ -43,18 +43,18 @@ ), ( "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_train.zip", - "530196631", + "531361237", "0x8D8244E90C15C07", ), ( "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_dev.zip", - "103456245", + "103593383", "0x8D8244E92005849", ), ( "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_utils.zip", "150359301", - "0x8D87F67E6CA4364", + "0x8D8B8AD5B2ED4C9", ), ], ) @@ -75,9 +75,9 @@ def test_download_mind_demo(tmp): def test_download_mind_small(tmp): train_path, valid_path = download_mind(size="small", dest_path=tmp) statinfo = os.stat(train_path) - assert statinfo.st_size == 52952752 + assert statinfo.st_size == 52953372 statinfo = os.stat(valid_path) - assert statinfo.st_size == 30945572 + assert statinfo.st_size == 30946172 def test_extract_mind_demo(tmp): @@ -127,9 +127,9 @@ def test_extract_mind_small(tmp): def test_download_mind_large(tmp_path): train_path, valid_path = download_mind(size="large", dest_path=tmp_path) statinfo = os.stat(train_path) - assert statinfo.st_size == 530196631 + assert statinfo.st_size == 531361237 statinfo = os.stat(valid_path) - assert statinfo.st_size == 103456245 + assert statinfo.st_size == 103593383 def test_extract_mind_large(tmp): From 5fa199d43303f48b712cf85ea66c47f1fccc45df Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Mon, 26 Aug 2024 10:03:00 +0200 Subject: [PATCH 42/67] :bug: Signed-off-by: miguelgfierro --- .../recommenders/datasets/test_mind.py | 52 
+++++++++---------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/tests/data_validation/recommenders/datasets/test_mind.py b/tests/data_validation/recommenders/datasets/test_mind.py index 201b59d6a..07995ad9c 100644 --- a/tests/data_validation/recommenders/datasets/test_mind.py +++ b/tests/data_validation/recommenders/datasets/test_mind.py @@ -29,33 +29,33 @@ ( "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_train.zip", "52953372", - "0x8D834F2EB31BDEC", - ), - ( - "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_dev.zip", - "30946172", - "0x8D834F2EBA8D865", - ), - ( - "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_utils.zip", - "155178106", - "0x8D87F67F4AEB960", - ), - ( - "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_train.zip", - "531361237", - "0x8D8244E90C15C07", - ), - ( - "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_dev.zip", - "103593383", - "0x8D8244E92005849", - ), - ( - "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_utils.zip", - "150359301", - "0x8D8B8AD5B2ED4C9", + '"0x8DCBBBBA40EECC6"', ), + # ( + # "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_dev.zip", + # "30946172", + # "0x8D834F2EBA8D865", + # ), + # ( + # "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_utils.zip", + # "155178106", + # "0x8D87F67F4AEB960", + # ), + # ( + # "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_train.zip", + # "531361237", + # "0x8D8244E90C15C07", + # ), + # ( + # "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_dev.zip", + # "103593383", + # "0x8D8244E92005849", + # ), + # ( + # "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_utils.zip", + # "150359301", + # "0x8D8B8AD5B2ED4C9", + # ), ], ) def test_mind_url(url, content_length, etag): From d1afac5feeaf2d1e04ced17a2ae18fb9a3897c56 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Mon, 26 Aug 2024 10:03:28 +0200 Subject: [PATCH 43/67] :bug: Signed-off-by: miguelgfierro --- .../recommenders/datasets/test_mind.py | 50 +++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/tests/data_validation/recommenders/datasets/test_mind.py b/tests/data_validation/recommenders/datasets/test_mind.py index 07995ad9c..3a8f65130 100644 --- a/tests/data_validation/recommenders/datasets/test_mind.py +++ b/tests/data_validation/recommenders/datasets/test_mind.py @@ -31,31 +31,31 @@ "52953372", '"0x8DCBBBBA40EECC6"', ), - # ( - # "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_dev.zip", - # "30946172", - # "0x8D834F2EBA8D865", - # ), - # ( - # "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_utils.zip", - # "155178106", - # "0x8D87F67F4AEB960", - # ), - # ( - # "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_train.zip", - # "531361237", - # "0x8D8244E90C15C07", - # ), - # ( - # "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_dev.zip", - # "103593383", - # "0x8D8244E92005849", - # ), - # ( - # "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_utils.zip", - # "150359301", - # "0x8D8B8AD5B2ED4C9", - # ), + ( + "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_dev.zip", + "30946172", + '"0x8D834F2EBA8D865"', + ), + ( + "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_utils.zip", + "155178106", + '"0x8D87F67F4AEB960"', + ), + ( + "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_train.zip", + "531361237", + 
'"0x8D8244E90C15C07"', + ), + ( + "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_dev.zip", + "103593383", + '"0x8D8244E92005849"', + ), + ( + "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_utils.zip", + "150359301", + '"0x8D8B8AD5B2ED4C9"', + ), ], ) def test_mind_url(url, content_length, etag): From a15a2e278d92a747694f1a0e7ec845dbe3ed372b Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Mon, 26 Aug 2024 10:04:35 +0200 Subject: [PATCH 44/67] test_mind_url Signed-off-by: miguelgfierro --- tests/data_validation/recommenders/datasets/test_mind.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/data_validation/recommenders/datasets/test_mind.py b/tests/data_validation/recommenders/datasets/test_mind.py index 3a8f65130..412e7a81a 100644 --- a/tests/data_validation/recommenders/datasets/test_mind.py +++ b/tests/data_validation/recommenders/datasets/test_mind.py @@ -34,22 +34,22 @@ ( "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_dev.zip", "30946172", - '"0x8D834F2EBA8D865"', + '"0x8DCBBBB989916FE"', ), ( "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_utils.zip", "155178106", - '"0x8D87F67F4AEB960"', + '"0x8D8B8AD5B3677C6"', ), ( "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_train.zip", "531361237", - '"0x8D8244E90C15C07"', + '"0x8DCBE08E04726C1"', ), ( "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_dev.zip", "103593383", - '"0x8D8244E92005849"', + '"0x8DCBE0865CB161F"', ), ( "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_utils.zip", From 60eaeb961399e3bc6d121a54b34d03d12d85ef51 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Mon, 26 Aug 2024 10:08:01 +0200 Subject: [PATCH 45/67] reorder Signed-off-by: miguelgfierro --- .../recommenders/datasets/test_mind.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/data_validation/recommenders/datasets/test_mind.py b/tests/data_validation/recommenders/datasets/test_mind.py index 412e7a81a..2e26e727a 100644 --- a/tests/data_validation/recommenders/datasets/test_mind.py +++ b/tests/data_validation/recommenders/datasets/test_mind.py @@ -72,14 +72,6 @@ def test_download_mind_demo(tmp): assert statinfo.st_size == 10080022 -def test_download_mind_small(tmp): - train_path, valid_path = download_mind(size="small", dest_path=tmp) - statinfo = os.stat(train_path) - assert statinfo.st_size == 52953372 - statinfo = os.stat(valid_path) - assert statinfo.st_size == 30946172 - - def test_extract_mind_demo(tmp): train_zip, valid_zip = download_mind(size="demo", dest_path=tmp) train_path, valid_path = extract_mind(train_zip, valid_zip, clean_zip_file=False) @@ -102,6 +94,14 @@ def test_extract_mind_demo(tmp): assert statinfo.st_size == 1044588 +def test_download_mind_small(tmp): + train_path, valid_path = download_mind(size="small", dest_path=tmp) + statinfo = os.stat(train_path) + assert statinfo.st_size == 52953372 + statinfo = os.stat(valid_path) + assert statinfo.st_size == 30946172 + + def test_extract_mind_small(tmp): train_zip, valid_zip = download_mind(size="small", dest_path=tmp) train_path, valid_path = extract_mind(train_zip, valid_zip, clean_zip_file=False) From 065038459d69b2a76929df2720b159d9a83979e9 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Mon, 26 Aug 2024 10:23:36 +0200 Subject: [PATCH 46/67] Uploaded new zip Signed-off-by: miguelgfierro --- tests/data_validation/recommenders/datasets/test_mind.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 
deletions(-) diff --git a/tests/data_validation/recommenders/datasets/test_mind.py b/tests/data_validation/recommenders/datasets/test_mind.py index 2e26e727a..397104fdc 100644 --- a/tests/data_validation/recommenders/datasets/test_mind.py +++ b/tests/data_validation/recommenders/datasets/test_mind.py @@ -28,12 +28,12 @@ ), ( "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_train.zip", - "52953372", + "52994575", '"0x8DCBBBBA40EECC6"', ), ( "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_dev.zip", - "30946172", + "30948560", '"0x8DCBBBB989916FE"', ), ( @@ -43,12 +43,12 @@ ), ( "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_train.zip", - "531361237", + "531360717", '"0x8DCBE08E04726C1"', ), ( "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_dev.zip", - "103593383", + "103592887", '"0x8DCBE0865CB161F"', ), ( From 078c1d3f154abbfa098b5c13dd5a81f6021a93b6 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Mon, 26 Aug 2024 10:24:20 +0200 Subject: [PATCH 47/67] Uploaded new zip Signed-off-by: miguelgfierro --- tests/data_validation/recommenders/datasets/test_mind.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/data_validation/recommenders/datasets/test_mind.py b/tests/data_validation/recommenders/datasets/test_mind.py index 397104fdc..24f996a9d 100644 --- a/tests/data_validation/recommenders/datasets/test_mind.py +++ b/tests/data_validation/recommenders/datasets/test_mind.py @@ -29,12 +29,12 @@ ( "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_train.zip", "52994575", - '"0x8DCBBBBA40EECC6"', + '"0x8DCC5A830190676"', ), ( "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_dev.zip", "30948560", - '"0x8DCBBBB989916FE"', + '"0x8DCC5A82E182A0F"', ), ( "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_utils.zip", @@ -44,12 +44,12 @@ ( "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_train.zip", "531360717", - '"0x8DCBE08E04726C1"', + '"0x8DCC5A8375BDC1D"', ), ( "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_dev.zip", "103592887", - '"0x8DCBE0865CB161F"', + '"0x8DCC5A82FE8609C"', ), ( "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_utils.zip", From fae2030f6493064e5c14a24f97ca29a7d0ae9720 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Mon, 26 Aug 2024 10:43:40 +0200 Subject: [PATCH 48/67] Fixing MIND large Signed-off-by: miguelgfierro --- tests/data_validation/recommenders/datasets/test_mind.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/data_validation/recommenders/datasets/test_mind.py b/tests/data_validation/recommenders/datasets/test_mind.py index 24f996a9d..8d835ad9b 100644 --- a/tests/data_validation/recommenders/datasets/test_mind.py +++ b/tests/data_validation/recommenders/datasets/test_mind.py @@ -97,9 +97,9 @@ def test_extract_mind_demo(tmp): def test_download_mind_small(tmp): train_path, valid_path = download_mind(size="small", dest_path=tmp) statinfo = os.stat(train_path) - assert statinfo.st_size == 52953372 + assert statinfo.st_size == 52994575 statinfo = os.stat(valid_path) - assert statinfo.st_size == 30946172 + assert statinfo.st_size == 30948560 def test_extract_mind_small(tmp): @@ -127,9 +127,9 @@ def test_extract_mind_small(tmp): def test_download_mind_large(tmp_path): train_path, valid_path = download_mind(size="large", dest_path=tmp_path) statinfo = os.stat(train_path) - assert statinfo.st_size == 531361237 + assert statinfo.st_size == 531360717 
statinfo = os.stat(valid_path)
-    assert statinfo.st_size == 103593383
+    assert statinfo.st_size == 103592887
 
 
 def test_extract_mind_large(tmp):

From d0a4af3ae6ad8af47c0ec525c7681fd20e6175ab Mon Sep 17 00:00:00 2001
From: Simon Zhao
Date: Tue, 27 Aug 2024 18:45:07 +0800
Subject: [PATCH 49/67] Update dev container (#2157)

* Update dev container configuration

Signed-off-by: Simon Zhao

* Use conda to manage environments

Signed-off-by: Simon Zhao

* Set Python interpreter

Signed-off-by: Simon Zhao

* Update

Signed-off-by: Simon Zhao

* Add machine specs

Signed-off-by: Simon Zhao

---------

Signed-off-by: Simon Zhao
---
 .devcontainer/Dockerfile        | 28 -----------
 .devcontainer/devcontainer.json | 88 ++++++++++++++++++---------------
 2 files changed, 47 insertions(+), 69 deletions(-)
 delete mode 100644 .devcontainer/Dockerfile

diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
deleted file mode 100644
index 02ad5a0e3..000000000
--- a/.devcontainer/Dockerfile
+++ /dev/null
@@ -1,28 +0,0 @@
-ARG PYTHON_VERSION
-FROM mcr.microsoft.com/vscode/devcontainers/python:${PYTHON_VERSION}
-
-ARG REMOTE_USER
-ENV HOME="/home/${REMOTE_USER}" \
-    JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64" \
-    PYSPARK_PYTHON="/usr/local/bin/python" \
-    PYSPARK_DRIVER_PYTHON="/usr/local/bin/python"
-
-RUN apt-get update && \
-    apt-get -y install --no-install-recommends software-properties-common && \
-    apt-add-repository 'deb http://security.debian.org/debian-security stretch/updates main' && \
-    apt-get update && \
-    apt-get -y install --no-install-recommends \
-        openjdk-8-jre \
-        cmake
-
-# Switch to non-root user
-USER ${REMOTE_USER}
-WORKDIR ${HOME}
-
-# Setup Jupyter Notebook
-ENV NOTEBOOK_CONFIG="${HOME}/.jupyter/jupyter_notebook_config.py"
-RUN mkdir -p $(dirname ${NOTEBOOK_CONFIG}) && \
-    echo "c.NotebookApp.ip='0.0.0.0'" >> ${NOTEBOOK_CONFIG} && \
-    echo "c.NotebookApp.open_browser=False" >> ${NOTEBOOK_CONFIG} && \
-    echo "c.NotebookApp.allow_origin='*'" >> ${NOTEBOOK_CONFIG}
-EXPOSE 8888
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index 4b74a526c..12d6ed822 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -1,44 +1,50 @@
 {
-    "name": "Recommenders",
-    "build": {
-        "dockerfile": "Dockerfile",
-        "context": "..",
-        "args": {
-            // Python version: 3, 3.6, 3.7
-            "PYTHON_VERSION": "3.7",
-            "REMOTE_USER": "vscode"
-        }
-    },
+    "name": "Recommenders",
+    // Version list: https://github.com/devcontainers/images/tree/main/src/base-ubuntu
+    // Includes: curl, wget, ca-certificates, git, Oh My Zsh!,
+    "image": "mcr.microsoft.com/devcontainers/base:ubuntu-24.04",
+    "hostRequirements": {
+        "cpus": 4,
+        "memory": "16gb",
+        "storage": "32gb"
+    },
+    "features": {
+        // https://github.com/devcontainers/features/blob/main/src/anaconda/devcontainer-feature.json
+        "ghcr.io/devcontainers/features/anaconda:1": {
+            "version": "2024.06-1"
+        }
+    },
+    "customizations": {
+        "vscode": {
+            // Set *default* container specific settings.json values on container create.
+ "settings": { + "[python]": { + "editor.defaultFormatter": "ms-python.black-formatter", + "editor.formatOnSave": true, + "editor.codeActionsOnSave": { + "source.organizeImports": "explicit" + } + }, + "isort.args": ["--profile", "black"], + "python.analysis.autoImportCompletions": true, + "python.defaultInterpreterPath": "/usr/local/conda/envs/Recommenders/bin/python", + "python.testing.pytestEnabled": true, + // set the directory where all tests are + "python.testing.pytestArgs": ["tests"] + }, + // Add the IDs of extensions you want installed when the container is created. + "extensions": [ + "ms-python.black-formatter", // https://marketplace.visualstudio.com/items?itemName=ms-python.black-formatter + "ms-python.isort", // https://marketplace.visualstudio.com/items?itemName=ms-python.isort + "ms-python.mypy-type-checker", // https://marketplace.visualstudio.com/items?itemName=ms-python.mypy-type-checker + "ms-python.pylint", // https://marketplace.visualstudio.com/items?itemName=ms-python.pylint + "ms-python.python", // https://marketplace.visualstudio.com/items?itemName=ms-python.python + "ms-toolsai.datawrangler", // https://marketplace.visualstudio.com/items?itemName=ms-toolsai.datawrangler + "ms-toolsai.jupyter" // https://marketplace.visualstudio.com/items?itemName=ms-toolsai.jupyter + ] + } + }, - // Set *default* container specific settings.json values on container create. - "settings": { - "python.pythonPath": "/usr/local/bin/python", - "python.languageServer": "Pylance", - "python.linting.enabled": true, - "python.linting.pylintEnabled": true, - "python.formatting.autopep8Path": "/usr/local/py-utils/bin/autopep8", - "python.formatting.blackPath": "/usr/local/py-utils/bin/black", - "python.formatting.yapfPath": "/usr/local/py-utils/bin/yapf", - "python.linting.banditPath": "/usr/local/py-utils/bin/bandit", - "python.linting.flake8Path": "/usr/local/py-utils/bin/flake8", - "python.linting.mypyPath": "/usr/local/py-utils/bin/mypy", - "python.linting.pycodestylePath": "/usr/local/py-utils/bin/pycodestyle", - "python.linting.pydocstylePath": "/usr/local/py-utils/bin/pydocstyle", - "python.linting.pylintPath": "/usr/local/py-utils/bin/pylint" - }, - - // Add the IDs of extensions you want installed when the container is created. - "extensions": [ - "ms-python.python", - "ms-python.vscode-pylance" - ], - - // Use 'forwardPorts' to make a list of ports inside the container available locally. - "forwardPorts": [8888], - - // Use 'postCreateCommand' to run commands after the container is created. - "postCreateCommand": "pip install -U pip && pip install --user -e .[dev,examples,spark,xlearn]", - - // Comment out connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root. - "remoteUser": "vscode" + // Use 'postCreateCommand' to run commands after the container is created. + "postCreateCommand": "conda create -n Recommenders -c conda-forge -y python=3.10 openjdk=21 pip && conda init bash && bash -c -i 'conda activate Recommenders && pip install -e .[dev,spark]' && conda config --set auto_activate_base false" } From ba8b24c44b481960a2229600285c4e3b82aa2f9a Mon Sep 17 00:00:00 2001 From: aaronpal Date: Tue, 27 Aug 2024 20:33:12 +0800 Subject: [PATCH 50/67] Correct variable used in pickle dump in `mind_utils.ipynb` Fixed an issue where the incorrect variable `word_dict` was being dumped to word_dict_all.pkl instead of `word_dict_all` in the `mind_utils.ipynb` notebook. 
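A minimal sketch of the failure mode, using hypothetical stand-in
dictionaries (the notebook builds the real `word_dict` and `word_dict_all`
from the MIND vocabulary earlier):

```python
import pickle

word_dict = {"news": 1}                   # e.g., words from the training set only
word_dict_all = {"news": 1, "sports": 2}  # e.g., words from train + validation

with open("word_dict_all.pkl", "wb") as f:
    pickle.dump(word_dict_all, f)  # previously dumped `word_dict` by mistake

# The round trip now yields the full vocabulary instead of the subset.
with open("word_dict_all.pkl", "rb") as f:
    assert pickle.load(f) == word_dict_all
```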
--- examples/01_prepare_data/mind_utils.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/01_prepare_data/mind_utils.ipynb b/examples/01_prepare_data/mind_utils.ipynb index e03a3683d..7a2d81e6e 100644 --- a/examples/01_prepare_data/mind_utils.ipynb +++ b/examples/01_prepare_data/mind_utils.ipynb @@ -306,7 +306,7 @@ " pickle.dump(word_dict, f)\n", " \n", "with open(os.path.join(output_path, 'word_dict_all.pkl'), 'wb') as f:\n", - " pickle.dump(word_dict, f)" + " pickle.dump(word_dict_all, f)" ] }, { From 1eb6619e7d78a8e4b9ac1750ef7ae6c61219af66 Mon Sep 17 00:00:00 2001 From: aaronpal Date: Tue, 27 Aug 2024 20:33:12 +0800 Subject: [PATCH 51/67] Correct variable used in pickle dump in `mind_utils.ipynb` Fixed an issue where the incorrect variable `word_dict` was being dumped to word_dict_all.pkl instead of `word_dict_all` in the `mind_utils.ipynb` notebook. Signed-off-by: aaron --- AUTHORS.md | 2 ++ examples/01_prepare_data/mind_utils.ipynb | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/AUTHORS.md b/AUTHORS.md index 1816f73e2..b70bfa644 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -52,6 +52,8 @@ To contributors: please add your name to the list when you submit a patch to the * **[Aaron He](https://github.com/AaronHeee)** * Reco utils of NCF * Deep dive notebook demonstrating the use of NCF +* **[Aaron Palpallatoc](https://github.com/ubergonmx)** + * Corrected variable in pickle dump in `mind_utils.ipynb` notebook * **[Abir Chakraborty](https://github.com/aeroabir)** * Self-Attentive Sequential Recommendation (SASRec) * Sequential Recommendation Via Personalized Transformer (SSEPT) diff --git a/examples/01_prepare_data/mind_utils.ipynb b/examples/01_prepare_data/mind_utils.ipynb index e03a3683d..7a2d81e6e 100644 --- a/examples/01_prepare_data/mind_utils.ipynb +++ b/examples/01_prepare_data/mind_utils.ipynb @@ -306,7 +306,7 @@ " pickle.dump(word_dict, f)\n", " \n", "with open(os.path.join(output_path, 'word_dict_all.pkl'), 'wb') as f:\n", - " pickle.dump(word_dict, f)" + " pickle.dump(word_dict_all, f)" ] }, { From baca6cf05cbee2ff81b7ae19ac6fceab6856c236 Mon Sep 17 00:00:00 2001 From: Miguel Fierro <3491412+miguelgfierro@users.noreply.github.com> Date: Wed, 28 Aug 2024 17:02:30 +0200 Subject: [PATCH 52/67] Update action.yml --- .github/actions/get-test-groups/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/get-test-groups/action.yml b/.github/actions/get-test-groups/action.yml index dc50e4b93..6e87da900 100644 --- a/.github/actions/get-test-groups/action.yml +++ b/.github/actions/get-test-groups/action.yml @@ -8,7 +8,7 @@ description: "Get test group names from tests_groups.py" inputs: TEST_KIND: required: true - description: Type of test - unit or nightly + description: Type of test - pr gate or nightly TEST_ENV: required: false description: Test environment - cpu, gpu or spark From 610e66346300ed87fc0f12a06cd9160a4aea5956 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Wed, 28 Aug 2024 19:57:27 +0200 Subject: [PATCH 53/67] Added extra MIND urls Signed-off-by: miguelgfierro --- recommenders/datasets/mind.py | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/recommenders/datasets/mind.py b/recommenders/datasets/mind.py index 23b7a8db2..7295786c2 100644 --- a/recommenders/datasets/mind.py +++ b/recommenders/datasets/mind.py @@ -17,26 +17,37 @@ ) -URL_MIND_LARGE_TRAIN = ( - "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_train.zip" 
+URL_MIND_DEMO_TRAIN = ( + "https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_train.zip" ) -URL_MIND_LARGE_VALID = ( - "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_dev.zip" +URL_MIND_DEMO_VALID = ( + "https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_dev.zip" +) +URL_MIND_DEMO_UTILS = ( + "https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_utils.zip" ) + URL_MIND_SMALL_TRAIN = ( "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_train.zip" ) URL_MIND_SMALL_VALID = ( "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_dev.zip" ) -URL_MIND_DEMO_TRAIN = ( - "https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_train.zip" +URL_MIND_SMALL_UTILS = ( + "https://recodatasets.z20.web.core.windows.net/newsrec/MINDsmall_utils.zip" ) -URL_MIND_DEMO_VALID = ( - "https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_dev.zip" + +URL_MIND_LARGE_TRAIN = ( + "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_train.zip" ) -URL_MIND_DEMO_UTILS = ( - "https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_utils.zip" +URL_MIND_LARGE_VALID = ( + "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_dev.zip" +) +URL_MIND_LARGE_TEST = ( + "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_test.zip" +) +URL_MIND_LARGE_UTILS = ( + "https://recodatasets.z20.web.core.windows.net/newsrec/MINDlarge_utils.zip" ) URL_MIND = { From b049091d8dc236143018b2e0702b4988c5286f27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20Dav=C3=B3?= Date: Fri, 6 Sep 2024 07:53:41 +0000 Subject: [PATCH 54/67] Added assert to avoid infinite loop in negative sampling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: David Davó --- recommenders/models/deeprec/DataModel/ImplicitCF.py | 1 + 1 file changed, 1 insertion(+) diff --git a/recommenders/models/deeprec/DataModel/ImplicitCF.py b/recommenders/models/deeprec/DataModel/ImplicitCF.py index 3cfbb2821..5a9174312 100644 --- a/recommenders/models/deeprec/DataModel/ImplicitCF.py +++ b/recommenders/models/deeprec/DataModel/ImplicitCF.py @@ -206,6 +206,7 @@ def train_loader(self, batch_size): """ def sample_neg(x): + assert len(x) < self.n_items, "A user has voted in every item. Can't find a negative sample" while True: neg_id = random.randint(0, self.n_items - 1) if neg_id not in x: From 84497f23d588159fc5788e8b13e72bbec4ae38dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20Dav=C3=B3?= Date: Mon, 9 Sep 2024 07:56:02 +0000 Subject: [PATCH 55/67] Changed assert to ValueError MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: David Davó --- recommenders/models/deeprec/DataModel/ImplicitCF.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/recommenders/models/deeprec/DataModel/ImplicitCF.py b/recommenders/models/deeprec/DataModel/ImplicitCF.py index 5a9174312..42bb319c4 100644 --- a/recommenders/models/deeprec/DataModel/ImplicitCF.py +++ b/recommenders/models/deeprec/DataModel/ImplicitCF.py @@ -206,7 +206,8 @@ def train_loader(self, batch_size): """ def sample_neg(x): - assert len(x) < self.n_items, "A user has voted in every item. Can't find a negative sample" + if len(x) >= self.n_items: + raise ValueError("A user has voted in every item. 
Can't find a negative sample.") while True: neg_id = random.randint(0, self.n_items - 1) if neg_id not in x: From c6ff09acd01f73f780b99c5a4aedbcd0dae07e49 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 20 Sep 2024 12:43:50 +0200 Subject: [PATCH 56/67] Update service principal role to AzureML Compute Operator for improved security Signed-off-by: miguelgfierro --- tests/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/README.md b/tests/README.md index df8e3e96d..19a718d2e 100644 --- a/tests/README.md +++ b/tests/README.md @@ -222,7 +222,7 @@ Then, follow the steps below to create the AzureML infrastructure: 3. Add the subscription ID to GitHub action secrets [here](https://github.com/recommenders-team/recommenders/settings/secrets/actions). Create a new repository secret called `AZUREML_TEST_SUBID` and add the subscription ID as the value. 4. Make sure you have installed [Azure CLI](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli), and that you are logged in: `az login`. 5. Select your subscription: `az account set -s $AZURE_SUBSCRIPTION_ID`. -6. Create a Service Principal: `az ad sp create-for-rbac --name $SERVICE_PRINCIPAL_NAME --role contributor --scopes /subscriptions/$AZURE_SUBSCRIPTION_ID --json-auth`. This will output a JSON blob with the credentials of the Service Principal: +6. Create a Service Principal: `az ad sp create-for-rbac --name $SERVICE_PRINCIPAL_NAME --role "AzureML Compute Operator" --scopes /subscriptions/$AZURE_SUBSCRIPTION_ID --json-auth`. This will output a JSON blob with the credentials of the Service Principal: ``` { "clientId": "XXXXXXXXXXXXXXXXXXXXX", From 88d9bf1c85e5beb5c1b66dc16fb378954c819593 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Mon, 23 Sep 2024 16:59:17 +0200 Subject: [PATCH 57/67] New roles Signed-off-by: miguelgfierro --- tests/README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/README.md b/tests/README.md index 19a718d2e..4958cedd2 100644 --- a/tests/README.md +++ b/tests/README.md @@ -237,7 +237,9 @@ Then, follow the steps below to create the AzureML infrastructure: "managementEndpointUrl": "https://management.core.windows.net/" } ``` -7. Add the output as github's action secret `AZUREML_TEST_CREDENTIALS` under repository's **Settings > Security > Secrets and variables > Actions**. +7. Assign AzureML Data Scientist role: `az role assignment create --assignee $SERVICE_PRINCIPAL_NAME --role "AzureML Data Scientist" --scope /subscriptions/$AZURE_SUBSCRIPTION_ID` +8. Assign Reader role: `az role assignment create --assignee $SERVICE_PRINCIPAL_NAME --role "Reader" --scope /subscriptions/$AZURE_SUBSCRIPTION_ID` +9. Add the output as github's action secret `AZUREML_TEST_CREDENTIALS` under repository's **Settings > Security > Secrets and variables > Actions**. ## How to execute tests in your local environment From c4205c1098bd85e6a9df4d42f72a77746ede32e4 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Sat, 5 Oct 2024 10:50:22 +0200 Subject: [PATCH 58/67] Update the code of conduct Signed-off-by: miguelgfierro --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 72ee78004..7f78b29f8 100644 --- a/README.md +++ b/README.md @@ -144,7 +144,7 @@ We provide a [benchmark notebook](examples/06_benchmarks/movielens.ipynb) to ill This project welcomes contributions and suggestions. Before contributing, please see our [contribution guidelines](CONTRIBUTING.md). 
-This project adheres to [Microsoft's Open Source Code of Conduct](CODE_OF_CONDUCT.md) in order to foster a welcoming and inspiring community for all.
+This project adheres to this [Code of Conduct](CODE_OF_CONDUCT.md) in order to foster a welcoming and inspiring community for all.
 
 ## Build Status

From 7cb4d491a4bd6096773df720f421ff0f21ad9f1b Mon Sep 17 00:00:00 2001
From: ved93
Date: Mon, 21 Oct 2024 16:07:36 +0530
Subject: [PATCH 59/67] added databricks setup instructions

Signed-off-by: ved93
---
 SETUP.md | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/SETUP.md b/SETUP.md
index 814118a49..323aefddf 100644
--- a/SETUP.md
+++ b/SETUP.md
@@ -50,16 +50,19 @@ pip install recommenders[spark]
 # c. Run the notebook.
 ```
 
-## Setup for Azure Databricks
+## Setup for Databricks
 
-The following instructions were tested on Azure Databricks Runtime 12.2 LTS (Apache Spark version 3.3.2) and 11.3 LTS (Apache Spark version 3.3.0).
-As of April 2023, Databricks Runtime 13 is not yet supported as it is on Python 3.10.
+The following instructions were tested on Databricks Runtime 15.4 LTS (Apache Spark version 3.5.0), 14.3 LTS (Apache Spark version 3.5.0), 13.3 LTS (Apache Spark version 3.4.1), and 12.2 LTS (Apache Spark version 3.3.2). We have tested the runtime on Python 3.9, 3.10, and 3.11.
 
-After an Azure Databricks cluster is provisioned:
+After a Databricks cluster is provisioned:
 ```bash
 # 1. Go to the "Compute" tab on the left of the page, click on the provisioned cluster and then click on "Libraries".
 # 2. Click the "Install new" button.
 # 3. In the popup window, select "PyPI" as the library source. Enter "recommenders[examples]" as the package name. Click "Install" to install the package.
+# 4. Now, repeat step 3 for the packages below:
+# a. numpy<2.0.0
+# b. pandera<=0.18.3
+# c.
scipy<=1.13.1 ``` ### Prepare Azure Databricks for Operationalization From 12bc1e421fef8e4091a7bdf2b7cea496c680c572 Mon Sep 17 00:00:00 2001 From: Simon Zhao Date: Tue, 12 Nov 2024 08:58:02 +0800 Subject: [PATCH 60/67] Use managed identity with OpenID Connect for Azure login (#2182) * Use managed identity with OpenID Connect Signed-off-by: Simon Zhao * Optimise std_log.txt path finding method Signed-off-by: Simon Zhao * statsmodels<0.14.4 Signed-off-by: Simon Zhao * numpy<1.25.0;python_version<='3.8' Signed-off-by: Simon Zhao * spacy<=3.7.5;python_version<='3.8' Signed-off-by: Simon Zhao * Update doc Signed-off-by: Simon Zhao --------- Signed-off-by: Simon Zhao --- .github/actions/azureml-test/action.yml | 14 ++++- .github/workflows/azureml-cpu-nightly.yml | 6 +- .github/workflows/azureml-gpu-nightly.yml | 6 +- .../workflows/azureml-release-pipeline.yml | 2 +- .github/workflows/azureml-spark-nightly.yml | 6 +- .github/workflows/azureml-unit-tests.yml | 6 +- setup.py | 6 +- tests/README.md | 60 ++++++++++++------- tests/ci/azureml_tests/post_pytest.py | 14 +++-- 9 files changed, 83 insertions(+), 37 deletions(-) diff --git a/.github/actions/azureml-test/action.yml b/.github/actions/azureml-test/action.yml index 91a437719..7c44abb37 100644 --- a/.github/actions/azureml-test/action.yml +++ b/.github/actions/azureml-test/action.yml @@ -15,9 +15,15 @@ inputs: TEST_KIND: required: true description: Type of test - unit or nightly - AZUREML_TEST_CREDENTIALS: + AZUREML_TEST_UMI_CLIENT_ID: required: true - description: Credentials for AzureML login + description: AzureML User-managed identity client ID + AZUREML_TEST_UMI_TENANT_ID: + required: true + description: AzureML User-managed identity tenant ID + AZUREML_TEST_UMI_SUB_ID: + required: true + description: AzureML User-managed identity subscription ID AZUREML_TEST_SUBID: required: true description: AzureML subscription ID @@ -53,7 +59,9 @@ runs: - name: Log in to Azure uses: azure/login@v2 with: - creds: ${{ inputs.AZUREML_TEST_CREDENTIALS }} + client-id: ${{ inputs.AZUREML_TEST_UMI_CLIENT_ID }} + tenant-id: ${{ inputs.AZUREML_TEST_UMI_TENANT_ID }} + subscription-id: ${{ inputs.AZUREML_TEST_UMI_SUB_ID }} - name: Submit tests to AzureML shell: bash run: | diff --git a/.github/workflows/azureml-cpu-nightly.yml b/.github/workflows/azureml-cpu-nightly.yml index 89fc64757..616707f7f 100644 --- a/.github/workflows/azureml-cpu-nightly.yml +++ b/.github/workflows/azureml-cpu-nightly.yml @@ -64,6 +64,8 @@ jobs: needs: get-test-groups name: ${{ join(matrix.*, ', ') }} runs-on: ubuntu-latest + permissions: + id-token: write # This is required for requesting the JWT strategy: max-parallel: 50 # Usage limits: https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration matrix: @@ -79,7 +81,9 @@ jobs: EXP_NAME: recommenders-nightly-${{ matrix.test-group }}-python${{ matrix.python-version }}-${{ github.ref }} ENV_NAME: recommenders-${{ github.sha }}-python${{ matrix.python-version }}${{ contains(matrix.test-group, 'gpu') && '-gpu' || '' }}${{ contains(matrix.test-group, 'spark') && '-spark' || '' }} TEST_KIND: 'nightly' - AZUREML_TEST_CREDENTIALS: ${{ secrets.AZUREML_TEST_CREDENTIALS }} + AZUREML_TEST_UMI_CLIENT_ID: ${{ secrets.AZUREML_TEST_UMI_CLIENT_ID }} + AZUREML_TEST_UMI_TENANT_ID: ${{ secrets.AZUREML_TEST_UMI_TENANT_ID }} + AZUREML_TEST_UMI_SUB_ID: ${{ secrets.AZUREML_TEST_UMI_SUB_ID }} AZUREML_TEST_SUBID: ${{ secrets.AZUREML_TEST_SUBID }} PYTHON_VERSION: ${{ matrix.python-version }} TEST_GROUP: ${{ matrix.test-group }} 
diff --git a/.github/workflows/azureml-gpu-nightly.yml b/.github/workflows/azureml-gpu-nightly.yml index 16e3e6ed2..23cffda0a 100644 --- a/.github/workflows/azureml-gpu-nightly.yml +++ b/.github/workflows/azureml-gpu-nightly.yml @@ -64,6 +64,8 @@ jobs: needs: get-test-groups name: ${{ join(matrix.*, ', ') }} runs-on: ubuntu-latest + permissions: + id-token: write # This is required for requesting the JWT strategy: max-parallel: 50 # Usage limits: https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration matrix: @@ -79,7 +81,9 @@ jobs: EXP_NAME: recommenders-nightly-${{ matrix.test-group }}-python${{ matrix.python-version }}-${{ github.ref }} ENV_NAME: recommenders-${{ github.sha }}-python${{ matrix.python-version }}${{ contains(matrix.test-group, 'gpu') && '-gpu' || '' }}${{ contains(matrix.test-group, 'spark') && '-spark' || '' }} TEST_KIND: 'nightly' - AZUREML_TEST_CREDENTIALS: ${{ secrets.AZUREML_TEST_CREDENTIALS }} + AZUREML_TEST_UMI_CLIENT_ID: ${{ secrets.AZUREML_TEST_UMI_CLIENT_ID }} + AZUREML_TEST_UMI_TENANT_ID: ${{ secrets.AZUREML_TEST_UMI_TENANT_ID }} + AZUREML_TEST_UMI_SUB_ID: ${{ secrets.AZUREML_TEST_UMI_SUB_ID }} AZUREML_TEST_SUBID: ${{ secrets.AZUREML_TEST_SUBID }} PYTHON_VERSION: ${{ matrix.python-version }} TEST_GROUP: ${{ matrix.test-group }} diff --git a/.github/workflows/azureml-release-pipeline.yml b/.github/workflows/azureml-release-pipeline.yml index d9899658e..983cce9db 100644 --- a/.github/workflows/azureml-release-pipeline.yml +++ b/.github/workflows/azureml-release-pipeline.yml @@ -37,7 +37,7 @@ jobs: - name: Setup python uses: actions/setup-python@v5 with: - python-version: "3.8" + python-version: "3.10" - name: Install wheel package run: pip install wheel - name: Create wheel from setup.py diff --git a/.github/workflows/azureml-spark-nightly.yml b/.github/workflows/azureml-spark-nightly.yml index 97789fccf..da508ebe4 100644 --- a/.github/workflows/azureml-spark-nightly.yml +++ b/.github/workflows/azureml-spark-nightly.yml @@ -63,6 +63,8 @@ jobs: needs: get-test-groups name: ${{ join(matrix.*, ', ') }} runs-on: ubuntu-latest + permissions: + id-token: write # This is required for requesting the JWT strategy: max-parallel: 50 # Usage limits: https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration matrix: @@ -78,7 +80,9 @@ jobs: EXP_NAME: recommenders-nightly-${{ matrix.test-group }}-python${{ matrix.python-version }}-${{ github.ref }} ENV_NAME: recommenders-${{ github.sha }}-python${{ matrix.python-version }}${{ contains(matrix.test-group, 'gpu') && '-gpu' || '' }}${{ contains(matrix.test-group, 'spark') && '-spark' || '' }} TEST_KIND: 'nightly' - AZUREML_TEST_CREDENTIALS: ${{ secrets.AZUREML_TEST_CREDENTIALS }} + AZUREML_TEST_UMI_CLIENT_ID: ${{ secrets.AZUREML_TEST_UMI_CLIENT_ID }} + AZUREML_TEST_UMI_TENANT_ID: ${{ secrets.AZUREML_TEST_UMI_TENANT_ID }} + AZUREML_TEST_UMI_SUB_ID: ${{ secrets.AZUREML_TEST_UMI_SUB_ID }} AZUREML_TEST_SUBID: ${{ secrets.AZUREML_TEST_SUBID }} PYTHON_VERSION: ${{ matrix.python-version }} TEST_GROUP: ${{ matrix.test-group }} diff --git a/.github/workflows/azureml-unit-tests.yml b/.github/workflows/azureml-unit-tests.yml index ed3b5a98d..0f7ed2a18 100644 --- a/.github/workflows/azureml-unit-tests.yml +++ b/.github/workflows/azureml-unit-tests.yml @@ -53,6 +53,8 @@ jobs: needs: get-test-groups name: ${{ join(matrix.*, ', ') }} runs-on: ubuntu-latest + permissions: + id-token: write # This is required for requesting the JWT strategy: max-parallel: 50 # Usage 
limits: https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration matrix: @@ -68,7 +70,9 @@ jobs: EXP_NAME: recommenders-unit-${{ matrix.test-group }}-python${{ matrix.python-version }}-${{ github.sha }} ENV_NAME: recommenders-${{ github.sha }}-python${{ matrix.python-version }}${{ contains(matrix.test-group, 'gpu') && '-gpu' || '' }}${{ contains(matrix.test-group, 'spark') && '-spark' || '' }} TEST_KIND: 'unit' - AZUREML_TEST_CREDENTIALS: ${{ secrets.AZUREML_TEST_CREDENTIALS }} + AZUREML_TEST_UMI_CLIENT_ID: ${{ secrets.AZUREML_TEST_UMI_CLIENT_ID }} + AZUREML_TEST_UMI_TENANT_ID: ${{ secrets.AZUREML_TEST_UMI_TENANT_ID }} + AZUREML_TEST_UMI_SUB_ID: ${{ secrets.AZUREML_TEST_UMI_SUB_ID }} AZUREML_TEST_SUBID: ${{ secrets.AZUREML_TEST_SUBID }} PYTHON_VERSION: ${{ matrix.python-version }} TEST_GROUP: ${{ matrix.test-group }} diff --git a/setup.py b/setup.py index 03df519ed..41b4c8aed 100644 --- a/setup.py +++ b/setup.py @@ -36,15 +36,15 @@ "nltk>=3.8.1,<4", # requires tqdm "notebook>=6.5.5,<8", # requires ipykernel, jinja2, jupyter, nbconvert, nbformat, packaging, requests "numba>=0.57.0,<1", - "numpy<2.0.0", # FIXME: Remove numpy<2.0.0 once cornac release a version newer than 2.2.1 that resolve ImportError: numpy.core.multiarray failed to import. "pandas>2.0.0,<3.0.0", # requires numpy "pandera[strategies]>=0.6.5,<0.18;python_version<='3.8'", # For generating fake datasets "pandera[strategies]>=0.15.0;python_version>='3.9'", "retrying>=1.3.4,<2", "scikit-learn>=1.2.0,<2", # requires scipy, and introduce breaking change affects feature_extraction.text.TfidfVectorizer.min_df "scikit-surprise>=1.1.3", - "scipy>=1.10.1,<=1.13.1", # FIXME: Remove scipy<=1.13.1 once cornac release a version newer than 2.2.1. See #2128 "seaborn>=0.13.0,<1", # requires matplotlib, packaging + "statsmodels<=0.14.1;python_version<='3.8'", + "statsmodels>=0.14.4;python_version>='3.9'", "transformers>=4.27.0,<5", # requires packaging, pyyaml, requests, tqdm ] @@ -52,7 +52,9 @@ extras_require = { "gpu": [ "fastai>=2.7.11,<3", + "numpy<1.25.0;python_version<='3.8'", "nvidia-ml-py>=11.525.84", + "spacy<=3.7.5;python_version<='3.8'", "tensorflow>=2.8.4,!=2.9.0.*,!=2.9.1,!=2.9.2,!=2.10.0.*,<2.16", # Fixed TF due to constant security problems and breaking changes #2073 "tf-slim>=1.1.0", # No python_requires in its setup.py "torch>=2.0.1,<3", diff --git a/tests/README.md b/tests/README.md index 4958cedd2..893df94c2 100644 --- a/tests/README.md +++ b/tests/README.md @@ -216,30 +216,46 @@ Then, follow the steps below to create the AzureML infrastructure: - Name: `azureml-test-workspace` - Resource group: `recommenders_project_resources` - Location: *Make sure you have enough quota in the location you choose* -2. Create two new clusters: `cpu-cluster` and `gpu-cluster`. Go to compute, then compute cluster, then new. +1. Create two new clusters: `cpu-cluster` and `gpu-cluster`. Go to compute, then compute cluster, then new. - Select the CPU VM base. Anything above 64GB of RAM, and 8 cores should be fine. - Select the GPU VM base. Anything above 56GB of RAM, and 6 cores, and an NVIDIA K80 should be fine. -3. Add the subscription ID to GitHub action secrets [here](https://github.com/recommenders-team/recommenders/settings/secrets/actions). Create a new repository secret called `AZUREML_TEST_SUBID` and add the subscription ID as the value. -4. Make sure you have installed [Azure CLI](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli), and that you are logged in: `az login`. -5. 
Select your subscription: `az account set -s $AZURE_SUBSCRIPTION_ID`.
-6. Create a Service Principal: `az ad sp create-for-rbac --name $SERVICE_PRINCIPAL_NAME --role "AzureML Compute Operator" --scopes /subscriptions/$AZURE_SUBSCRIPTION_ID --json-auth`. This will output a JSON blob with the credentials of the Service Principal:
-    ```
-    {
-        "clientId": "XXXXXXXXXXXXXXXXXXXXX",
-        "clientSecret": "XXXXXXXXXXXXXXXXXXXXX",
-        "subscriptionId": "XXXXXXXXXXXXXXXXXXXXX",
-        "tenantId": "XXXXXXXXXXXXXXXXXXXXX",
-        "activeDirectoryEndpointUrl": "https://login.microsoftonline.com",
-        "resourceManagerEndpointUrl": "https://management.azure.com/",
-        "activeDirectoryGraphResourceId": "https://graph.windows.net/",
-        "sqlManagementEndpointUrl": "https://management.core.windows.net:8443/",
-        "galleryEndpointUrl": "https://gallery.azure.com/",
-        "managementEndpointUrl": "https://management.core.windows.net/"
-    }
-    ```
-7. Assign AzureML Data Scientist role: `az role assignment create --assignee $SERVICE_PRINCIPAL_NAME --role "AzureML Data Scientist" --scope /subscriptions/$AZURE_SUBSCRIPTION_ID`
-8. Assign Reader role: `az role assignment create --assignee $SERVICE_PRINCIPAL_NAME --role "Reader" --scope /subscriptions/$AZURE_SUBSCRIPTION_ID`
-9. Add the output as github's action secret `AZUREML_TEST_CREDENTIALS` under repository's **Settings > Security > Secrets and variables > Actions**.
+1. Add the subscription ID to GitHub action secrets
+   [here](https://github.com/recommenders-team/recommenders/settings/secrets/actions).
+   * Create a new repository secret called `AZUREML_TEST_SUBID` and
+     add the subscription ID as the value.
+1. Set up [login with OpenID Connect
+   (OIDC)](https://github.com/marketplace/actions/azure-login#login-with-openid-connect-oidc-recommended)
+   for GitHub Actions.
+   1. Create a user-assigned managed identity (UMI) and assign the
+      following 3 roles of the AzureML workspace created above to the
+      UMI (See [Create a user-assigned managed
+      identity](https://learn.microsoft.com/en-us/entra/identity/managed-identities-azure-resources/how-manage-user-assigned-managed-identities?pivots=identity-mi-methods-azp#create-a-user-assigned-managed-identity)):
+      * AzureML Compute Operator
+      * AzureML Data Scientist
+      * Reader
+   1. [Create a federated identity credential on the
+      UMI](https://learn.microsoft.com/en-us/entra/workload-id/workload-identity-federation-create-trust-user-assigned-managed-identity?pivots=identity-wif-mi-methods-azp#github-actions-deploying-azure-resources)
+      with the following settings:
+      * Name: A unique name for the federated identity credential
+        within your application.
+      * Issuer: Set to `https://token.actions.githubusercontent.com`
+        for GitHub Actions.
+      * Subject: The subject claim format, e.g.,
+        `repo:recommenders-team/recommenders:ref:refs/heads/`:
+        + `repo:recommenders-team/recommenders:pull_request`
+        + `repo:recommenders-team/recommenders:ref:refs/heads/staging`
+        + `repo:recommenders-team/recommenders:ref:refs/heads/main`
+      * Description: (Optional) A description of the credential.
+      * Audiences: Specifies who can use this credential; for GitHub
+        Actions, use `api://AzureADTokenExchange`.
+1. Create 3 Actions secrets
+   * `AZUREML_TEST_UMI_TENANT_ID`
+   * `AZUREML_TEST_UMI_SUB_ID`
+   * `AZUREML_TEST_UMI_CLIENT_ID`
+
+   and use the UMI's tenant ID, subscription ID and client ID as the
+   values of the secrets, respectively, under the repository's
+   **Settings > Security > Secrets and variables > Actions**.
 ## How to execute tests in your local environment

diff --git a/tests/ci/azureml_tests/post_pytest.py b/tests/ci/azureml_tests/post_pytest.py
index b457e709d..26472ea46 100644
--- a/tests/ci/azureml_tests/post_pytest.py
+++ b/tests/ci/azureml_tests/post_pytest.py
@@ -89,8 +89,12 @@ def parse_args():
         run_id=run.info.run_id,
         dst_path=args.log_dir,
     )
-    log_path = pathlib.Path("user_logs/std_log.txt")
-    with open(pathlib.Path(args.log_dir) / log_path, "r") as file:
-        print(f"\nDumping logs in {log_path}")
-        print("=====================================")
-        print(file.read())
+    log_path = next(
+        (path for path in pathlib.Path(args.log_dir).rglob("std_log.txt")),
+        None
+    )
+    if log_path is not None:
+        with open(log_path, "r") as file:
+            print(f"\nDumping logs in {log_path}")
+            print("=====================================")
+            print(file.read())

From f2abb555efcb0a59594e92b00c054c3b23e50543 Mon Sep 17 00:00:00 2001
From: Simon Zhao
Date: Thu, 14 Nov 2024 11:15:30 +0800
Subject: [PATCH 61/67] Merge multiple Dockerfiles into a single one (#2167)

Merge multiple Dockerfiles into a single one

---------

Signed-off-by: Simon Zhao
Co-authored-by: Miguel Fierro <3491412+miguelgfierro@users.noreply.github.com>
---
 .devcontainer/devcontainer.json             |  54 ++--
 SETUP.md                                    |  85 ++++++
 tests/README.md                             |  23 +-
 tests/ci/azureml_tests/aml_utils.py         | 110 +++-----
 .../submit_groupwise_azureml_pytest.py      |  28 +-
 tools/docker/Dockerfile                     | 262 +++++++-----------
 tools/docker/README.md                      | 148 +++++-----
 7 files changed, 360 insertions(+), 350 deletions(-)

diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index 12d6ed822..80ee30f02 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -1,22 +1,18 @@
 {
     "name": "Recommenders",
-    // Version list: https://github.com/devcontainers/images/tree/main/src/base-ubuntu
-    // Includes: curl, wget, ca-certificates, git, Oh My Zsh!,
-    "image": "mcr.microsoft.com/devcontainers/base:ubuntu-24.04",
-    "hostRequirements": {
-        "cpus": 4,
-        "memory": "16gb",
-        "storage": "32gb"
-    },
-    "features": {
-        // https://github.com/devcontainers/features/blob/main/src/anaconda/devcontainer-feature.json
-        "ghcr.io/devcontainers/features/anaconda:1": {
-            "version": "2024.06-1"
+    "build": {
+        "dockerfile": "../tools/docker/Dockerfile",
+        "context": "..",
+        "target": "deps",
+        "args": {
+            "COMPUTE": "cpu",
+            "PYTHON_VERSION": "3.11"
         }
     },
     "customizations": {
         "vscode": {
-            // Set *default* container specific settings.json values on container create.
+            // Set default container specific settings.json values on container
+            // create
             "settings": {
                 "[python]": {
                     "editor.defaultFormatter": "ms-python.black-formatter",
@@ -27,24 +23,32 @@
                 },
                 "isort.args": ["--profile", "black"],
                 "python.analysis.autoImportCompletions": true,
-                "python.defaultInterpreterPath": "/usr/local/conda/envs/Recommenders/bin/python",
+                // Conda env name *must* align with the one in Dockerfile
+                "python.defaultInterpreterPath": "/root/conda/envs/Recommenders/bin/python",
                 "python.testing.pytestEnabled": true,
-                // set the directory where all tests are
+                // Test directory
                 "python.testing.pytestArgs": ["tests"]
             },
-            // Add the IDs of extensions you want installed when the container is created.
+ // VS Code extensions to install on container create "extensions": [ - "ms-python.black-formatter", // https://marketplace.visualstudio.com/items?itemName=ms-python.black-formatter - "ms-python.isort", // https://marketplace.visualstudio.com/items?itemName=ms-python.isort - "ms-python.mypy-type-checker", // https://marketplace.visualstudio.com/items?itemName=ms-python.mypy-type-checker - "ms-python.pylint", // https://marketplace.visualstudio.com/items?itemName=ms-python.pylint - "ms-python.python", // https://marketplace.visualstudio.com/items?itemName=ms-python.python - "ms-toolsai.datawrangler", // https://marketplace.visualstudio.com/items?itemName=ms-toolsai.datawrangler - "ms-toolsai.jupyter" // https://marketplace.visualstudio.com/items?itemName=ms-toolsai.jupyter + // https://marketplace.visualstudio.com/items?itemName=ms-python.black-formatter + "ms-python.black-formatter", + // https://marketplace.visualstudio.com/items?itemName=ms-python.isort + "ms-python.isort", + // https://marketplace.visualstudio.com/items?itemName=ms-python.mypy-type-checker + "ms-python.mypy-type-checker", + // https://marketplace.visualstudio.com/items?itemName=ms-python.pylint + "ms-python.pylint", + // https://marketplace.visualstudio.com/items?itemName=ms-python.python + "ms-python.python", + // https://marketplace.visualstudio.com/items?itemName=ms-toolsai.datawrangler + "ms-toolsai.datawrangler", + // https://marketplace.visualstudio.com/items?itemName=ms-toolsai.jupyter + "ms-toolsai.jupyter" ] } }, - // Use 'postCreateCommand' to run commands after the container is created. - "postCreateCommand": "conda create -n Recommenders -c conda-forge -y python=3.10 openjdk=21 pip && conda init bash && bash -c -i 'conda activate Recommenders && pip install -e .[dev,spark]' && conda config --set auto_activate_base false" + // Install Recommenders in development mode after container create + "postCreateCommand": "bash -i -c 'conda activate Recommenders && conda install -c conda-forge -y openjdk=21 && pip install -e .[dev,spark]'" } diff --git a/SETUP.md b/SETUP.md index 323aefddf..4593b4cfb 100644 --- a/SETUP.md +++ b/SETUP.md @@ -145,6 +145,91 @@ git checkout staging pip install -e .[all] ``` +We also provide a [devcontainer.json](./.devcontainer/devcontainer.json) +and [Dockerfile](./tools/docker/Dockerfile) for developers to +facilitate the development on +[Dev Containers with VS Code](https://code.visualstudio.com/docs/devcontainers/containers) +and [GitHub Codespaces](https://github.com/features/codespaces). + +
+<details>
+<summary>VS Code Dev Containers</summary>
+
+The typical scenario using Docker containers for development is as
+follows. Say, we want to develop applications for a specific
+environment, so
+1. we create a container with the dependencies required,
+1. and mount the folder containing the code to the container,
+1. then code parsing, debugging and testing are all performed against
+   the container.
+
+This workflow separates the development environment from your local
+environment, so that your local environment won't be affected. The
+container used for this purpose is called a Dev Container in the
+VS Code Dev Containers extension, and the extension automates this
+development workflow with Docker containers.
+
+To use VS Code Dev Containers, your local machine must have the
+following applications installed:
+* [Docker](https://docs.docker.com/get-started/get-docker/)
+* [VS Code Remote Development Extension Pack](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.vscode-remote-extensionpack)
+
+Then
+* When you open your local Recommenders folder in VS Code, it will
+  detect [devcontainer.json](./.devcontainer/devcontainer.json), and
+  prompt you to **Reopen in Container**. If you'd like to reopen,
+  it will create a container with the required environment described
+  in `devcontainer.json`, install a VS Code server in the container,
+  and mount the folder into the container.
+
+  If you don't see the prompt, you can use the command
+  **Dev Containers: Reopen in Container**.
+* If you don't have a local clone of Recommenders, you can also use
+  the command **Dev Containers: Clone Repository in Container Volume**,
+  and type in a branch/PR URL of Recommenders you'd like to develop
+  on, such as https://github.com/recommenders-team/recommenders,
+  https://github.com/recommenders-team/recommenders/tree/staging, or
+  https://github.com/recommenders-team/recommenders/pull/2098. VS
+  Code will create a container with the environment described in
+  `devcontainer.json`, and clone the specified branch of Recommenders
+  into the container.
+
+Once everything is set up, VS Code will act as a client to the server
+in the container, and all subsequent operations on VS Code will be
+performed against the container.
+
+</details>
+ +
+<details>
+<summary>GitHub Codespaces</summary>
+
+GitHub Codespaces also uses `devcontainer.json` and Dockerfile in the
+repo to create the environment on a VM for you to develop on the web
+version of VS Code. To use GitHub Codespaces on Recommenders, you can
+go to [Recommenders](https://github.com/recommenders-team/recommenders)
+$\to$ switch to the branch of interest $\to$ Code $\to$ Codespaces
+$\to$ Create codespace on the branch.
+
+</details>
+ +
+<details>
+<summary>devcontainer.json & Dockerfile</summary>
+
+[devcontainer.json](./.devcontainer/devcontainer.json) describes:
+* the Dockerfile to use with configurable build arguments, such as
+  `COMPUTE` and `PYTHON_VERSION`.
+* settings on VS Code server, such as the Python interpreter path in
+  the container, and the Python formatter.
+* extensions on VS Code server, such as black-formatter, pylint.
+* how to create the Conda environment for Recommenders in
+  `postCreateCommand`.
+
+[Dockerfile](./tools/docker/Dockerfile) is used in 3 places:
+* Dev containers on VS Code and GitHub Codespaces
+* [Testing workflows on AzureML](./tests/README.md)
+* [Jupyter notebook examples on Docker](./tools/docker/README.md)
+
+</details>
+
+
 ## Test Environments
 
 Depending on the type of recommender system and the notebook that needs to be run, there are different computational requirements.
diff --git a/tests/README.md b/tests/README.md
index 893df94c2..cb938c427 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -63,9 +63,26 @@ GitHub workflows `azureml-unit-tests.yml`, `azureml-cpu-nightly.yml`, `azureml-g
 
 There are three scripts used with each workflow, all of them are located in [ci/azureml_tests](./ci/azureml_tests/):
 
-* `submit_groupwise_azureml_pytest.py`: this script uses parameters in the workflow yml to set up the AzureML environment for testing using the AzureML SDK.
-* `run_groupwise_pytest.py`: this script uses pytest to run the tests of the libraries and notebooks. This script runs in an AzureML workspace with the environment created by the script above.
-* `test_groups.py`: this script defines the groups of tests. If the tests are part of the unit tests, the total compute time of each group should be less than 15min. If the tests are part of the nightly builds, the total time of each group should be less than 35min.
+* [`submit_groupwise_azureml_pytest.py`](./ci/azureml_tests/submit_groupwise_azureml_pytest.py):
+  this script uses parameters in the workflow yml to set up the
+  AzureML environment for testing using the AzureML SDK.
+* [`run_groupwise_pytest.py`](./ci/azureml_tests/run_groupwise_pytest.py):
+  this script uses pytest to run the tests of the libraries and
+  notebooks. This script runs in an AzureML workspace with the
+  environment created by the script above.
+* [`aml_utils.py`](./ci/azureml_tests/aml_utils.py): this script
+  defines several utility functions using
+  [the AzureML Python SDK v2](https://learn.microsoft.com/en-us/azure/machine-learning/concept-v2?view=azureml-api-2).
+  These functions are used by the scripts above to set up the compute and
+  the environment for the tests on AzureML. For example, the
+  environment with all dependencies of Recommenders is created by the
+  function `get_or_create_environment` via the [Dockerfile](../tools/docker/Dockerfile).
+  More details on Docker support can be found at [tools/docker/README.md](../tools/docker/README.md).
+* [`test_groups.py`](./ci/azureml_tests/test_groups.py): this script
+  defines the groups of tests. If the tests are part of the unit
+  tests, the total compute time of each group should be less than
+  15min. If the tests are part of the nightly builds, the total time
+  of each group should be less than 35min.
 
 ## How to contribute tests to the repository
diff --git a/tests/ci/azureml_tests/aml_utils.py b/tests/ci/azureml_tests/aml_utils.py
index 5a4d488e3..3e9a6782e 100644
--- a/tests/ci/azureml_tests/aml_utils.py
+++ b/tests/ci/azureml_tests/aml_utils.py
@@ -8,7 +8,7 @@
 * https://learn.microsoft.com/en-us/azure/machine-learning/reference-migrate-sdk-v1-mlflow-tracking?view=azureml-api-2&tabs=aml%2Ccli%2Cmlflow
 """
 import pathlib
-import tempfile
+import re
 
 from azure.ai.ml import MLClient, command
 from azure.ai.ml.entities import AmlCompute, BuildContext, Environment, Workspace
@@ -16,6 +16,7 @@
 from azure.core.exceptions import ResourceExistsError
 from azure.identity import DefaultAzureCredential
 
+
 def get_client(subscription_id, resource_group, workspace_name):
     """
     Get the client with specified AzureML workspace, or create one if not existing.
@@ -61,9 +62,8 @@ def get_or_create_environment( environment_name, use_gpu, use_spark, - conda_pkg_jdk, + conda_openjdk_version, python_version, - commit_sha, ): """ AzureML requires the run environment to be setup prior to submission. @@ -77,81 +77,39 @@ def get_or_create_environment( added to the conda environment, else False use_spark (bool): True if PySpark packages should be added to the conda environment, else False - conda_pkg_jdk (str): "openjdk=8" by default - python_version (str): python version, such as "3.9" - commit_sha (str): the commit that triggers the workflow + conda_openjdk_version (str): "21" by default + python_version (str): python version, such as "3.11" """ - conda_env_name = "reco" - conda_env_yml = "environment.yml" - condafile = fr""" -name: {conda_env_name} -channels: - - conda-forge -dependencies: - - python={python_version} - - {conda_pkg_jdk} - - pip - - pip: - - recommenders[dev{",gpu" if use_gpu else ""}{",spark" if use_spark else ""}]@git+https://github.com/recommenders-team/recommenders.git@{commit_sha} -""" - # See https://github.com/Azure/AzureML-Containers/blob/master/base/cpu/openmpi4.1.0-ubuntu22.04 - image = "mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu22.04" - # See https://github.com/Azure/AzureML-Containers/blob/master/base/gpu/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04 - dockerfile = fr"""# syntax=docker/dockerfile:1 -FROM nvcr.io/nvidia/cuda:12.5.1-devel-ubuntu22.04 -SHELL ["/bin/bash", "-c"] -USER root:root -ENV NVIDIA_VISIBLE_DEVICES all -ENV NVIDIA_DRIVER_CAPABILITIES compute,utility -ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 -ENV DEBIAN_FRONTEND noninteractive -RUN apt-get update && \ - apt-get install -y wget git-all && \ - apt-get clean -y && \ - rm -rf /var/lib/apt/lists/* - -# Install Conda -ENV CONDA_PREFIX /opt/miniconda -RUN wget -qO /tmp/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py311_24.5.0-0-Linux-x86_64.sh && \ - bash /tmp/miniconda.sh -bf -p ${{CONDA_PREFIX}} && \ - ${{CONDA_PREFIX}}/bin/conda update --all -c conda-forge -y && \ - ${{CONDA_PREFIX}}/bin/conda clean -ay && \ - rm -rf ${{CONDA_PREFIX}}/pkgs && \ - rm /tmp/miniconda.sh && \ - find / -type d -name __pycache__ | xargs rm -rf - -# Create Conda environment -COPY {conda_env_yml} /tmp/{conda_env_yml} -RUN ${{CONDA_PREFIX}}/bin/conda env create -f /tmp/{conda_env_yml} - -# Activate Conda environment -ENV CONDA_DEFAULT_ENV {conda_env_name} -ENV CONDA_PREFIX ${{CONDA_PREFIX}}/envs/${{CONDA_DEFAULT_ENV}} -ENV PATH="${{CONDA_PREFIX}}/bin:${{PATH}}" LD_LIBRARY_PATH="${{CONDA_PREFIX}}/lib:$LD_LIBRARY_PATH" -""" - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = pathlib.Path(tmpdir) - dockerfile_path = tmpdir / "Dockerfile" - condafile_path = tmpdir / conda_env_yml - build = BuildContext(path=tmpdir, dockerfile_path=dockerfile_path.name) - - with open(dockerfile_path, "w") as file: - file.write(dockerfile) - with open(condafile_path, "w") as file: - file.write(condafile) - - try: - client.environments.create_or_update( - Environment( - name=environment_name, - image=None if use_gpu else image, - build=build if use_gpu else None, - conda_file=None if use_gpu else condafile_path, - ) + compute = "gpu" if use_gpu else "cpu" + extras = ( + "[dev" + (",gpu" if use_gpu else "") + (",spark" if use_spark else "") + "]" + ) + dockerfile = pathlib.Path("tools/docker/Dockerfile") + + # Docker's --build-args is not supported by AzureML Python SDK v2 as shown + # in [the issue #33902](https://github.com/Azure/azure-sdk-for-python/issues/33902) + # so the build args are 
configured by regex substitution
+    text = dockerfile.read_text()
+    text = re.sub(r"(ARG\sCOMPUTE=).*", rf'\1"{compute}"', text)
+    text = re.sub(r"(ARG\sEXTRAS=).*", rf'\1"{extras}"', text)
+    text = re.sub(r"(ARG\sGIT_REF=).*", r'\1""', text)
+    text = re.sub(r"(ARG\sJDK_VERSION=).*", rf'\1"{conda_openjdk_version}"', text)
+    text = re.sub(r"(ARG\sPYTHON_VERSION=).*", rf'\1"{python_version}"', text)
+    dockerfile.write_text(text)
+
+    try:
+        client.environments.create_or_update(
+            Environment(
+                name=environment_name,
+                build=BuildContext(
+                    # Set path for Docker to access the Recommenders root
+                    path=".",
+                    dockerfile_path=dockerfile,
+                ),
             )
-        except ResourceExistsError:
-            pass
+        )
+    except ResourceExistsError:
+        pass
 
 
 def run_tests(
diff --git a/tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py b/tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py
index 4ce6106bf..02698015e 100644
--- a/tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py
+++ b/tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py
@@ -108,15 +108,15 @@ def parse_args():
         help="Environment name on AzureML",
     )
     parser.add_argument(
-        "--conda_pkg_jdk",
+        "--conda-openjdk-version",
         action="store",
-        default="openjdk=8",
-        help="Conda package for JDK",
+        default="21",
+        help="Conda OpenJDK package version",
     )
     parser.add_argument(
         "--python-version",
         action="store",
-        default="3.8",
+        default="3.11",
         help="Python version",
     )
     parser.add_argument(
@@ -133,19 +133,16 @@
     logger = logging.getLogger("submit_groupwise_azureml_pytest.py")
 
     args = parse_args()
 
-    logger.info(f"Setting up workspace {args.ws}")
+    logger.info("Setting up workspace %s", args.ws)
     client = get_client(
         subscription_id=args.subid,
         resource_group=args.rg,
         workspace_name=args.ws,
     )
 
-    logger.info(f"Setting up compute {args.cluster}")
+    logger.info("Setting up compute %s", args.cluster)
     create_or_start_compute(
-        client=client,
-        name=args.cluster,
-        size=args.vmsize,
-        max_instances=args.maxnodes
+        client=client, name=args.cluster, size=args.vmsize, max_instances=args.maxnodes
     )
 
     # TODO: Unlike Azure DevOps pipelines, GitHub Actions only has simple
@@ -159,19 +156,18 @@
     # * on AzureML
     #   recommenders-unit-group_cpu_001-python3_8-c8adeafabc011b549f875dc145313ffbe3fc53a8
     environment_name = correct_resource_name(args.envname)
-    logger.info(f"Setting up environment {environment_name}")
+    logger.info("Setting up environment %s", environment_name)
    get_or_create_environment(
         client=client,
         environment_name=environment_name,
-        use_gpu=True if "gpu" in args.testgroup else False,
-        use_spark=True if "spark" in args.testgroup else False,
-        conda_pkg_jdk=args.conda_pkg_jdk,
+        use_gpu="gpu" in args.testgroup,
+        use_spark="spark" in args.testgroup,
+        conda_openjdk_version=args.conda_openjdk_version,
         python_version=args.python_version,
-        commit_sha=args.sha,
     )
 
     experiment_name = correct_resource_name(args.expname)
-    logger.info(f"Running experiment {experiment_name}")
+    logger.info("Running experiment %s", experiment_name)
     run_tests(
         client=client,
         compute=args.cluster,
diff --git a/tools/docker/Dockerfile b/tools/docker/Dockerfile
index fee64adfb..fc5ba4abf 100644
--- a/tools/docker/Dockerfile
+++ b/tools/docker/Dockerfile
@@ -1,189 +1,115 @@
+# syntax=docker/dockerfile:1
+
 # Copyright (c) Recommenders contributors.
 # Licensed under the MIT License.
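As an aside, here is a self-contained sketch of the capture-group substitution that `get_or_create_environment` performs in the hunk above. The Dockerfile text is a stand-in string for this demo, not the actual file:

```python
import re

# Stand-in for the ARG lines at the top of tools/docker/Dockerfile.
dockerfile_text = '''ARG COMPUTE="cpu"
ARG EXTRAS=""
ARG PYTHON_VERSION="3.11"
'''

# Group 1 keeps the `ARG NAME=` prefix; only the value after it is rewritten.
dockerfile_text = re.sub(r"(ARG\sCOMPUTE=).*", r'\1"gpu"', dockerfile_text)
dockerfile_text = re.sub(r"(ARG\sEXTRAS=).*", r'\1"[dev,gpu]"', dockerfile_text)

print(dockerfile_text)
# ARG COMPUTE="gpu"
# ARG EXTRAS="[dev,gpu]"
# ARG PYTHON_VERSION="3.11"
```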
-ARG ENV="cpu" -ARG HOME="/root" +##################################################################### +# Stage build order depending on the compute: +# Compute Stage (CPU/GPU) -> Dependencies Stage -> Final Stage +##################################################################### +# Valid computes: cpu, gpu +ARG COMPUTE="cpu" -FROM mcr.microsoft.com/mirror/docker/library/ubuntu:18.04 AS base -LABEL maintainer="Microsoft Recommender Project " +##################################################################### +# Compute Stage - CPU +# Choose an appropriate CPU compute image +##################################################################### +# * [buildpack-deps:24.04](https://github.com/docker-library/buildpack-deps/blob/master/ubuntu/noble/Dockerfile) +# + [Created on 2024-08-17](https://hub.docker.com/layers/library/buildpack-deps/noble/images/sha256-dbfee7e7ee2340b0d6567efd3a8a9281ce45ee78598485b4d7a7f09fe641811a) +FROM buildpack-deps@sha256:dbfee7e7ee2340b0d6567efd3a8a9281ce45ee78598485b4d7a7f09fe641811a AS cpu -ARG HOME -ARG VIRTUAL_ENV -ENV HOME="${HOME}" -WORKDIR ${HOME} -# Exit if VIRTUAL_ENV is not specified correctly -RUN if [ "${VIRTUAL_ENV}" != "conda" ] && [ "${VIRTUAL_ENV}" != "venv" ] && [ "${VIRTUAL_ENV}" != "virtualenv" ]; then \ - echo 'VIRTUAL_ENV argument should be either "conda", "venv" or "virtualenv"'; exit 1; fi +##################################################################### +# Compute Stage - GPU +# Choose an appropriate GPU compute image +##################################################################### +# * [nvidia/cuda:12.6.1-devel-ubuntu24.04](https://gitlab.com/nvidia/container-images/cuda/-/blob/master/dist/12.6.1/ubuntu2404/devel/Dockerfile) +# + [Created on 2024-09-13](https://hub.docker.com/layers/nvidia/cuda/12.6.1-devel-ubuntu24.04/images/sha256-bfc293f21611f3c47a3442cf6516ebfe99d529926a4bef4bc389ef02fd038800) +# * See also [AML GPU Base Image](https://github.com/Azure/AzureML-Containers/blob/master/base/gpu/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04) +FROM nvcr.io/nvidia/cuda:12.6.1-devel-ubuntu24.04@sha256:bfc293f21611f3c47a3442cf6516ebfe99d529926a4bef4bc389ef02fd038800 AS gpu -# Install base dependencies and libpython (for cornac) -RUN apt-get update && \ - apt-get install -y curl build-essential -RUN apt-get install -y libpython3.7-dev -RUN apt-get install -y python3-dev - -# Install Anaconda -ARG ANACONDA="https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" -RUN if [ "${VIRTUAL_ENV}" = "conda" ] ; then curl ${ANACONDA} -o anaconda.sh && \ - /bin/bash anaconda.sh -b -p conda && \ - rm anaconda.sh && \ - echo ". 
${HOME}/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ - echo "conda activate base" >> ~/.bashrc ; fi - -ENV PATH="${HOME}/${VIRTUAL_ENV}/bin:${PATH}" - -# --login option used to source bashrc (thus activating conda env) at every RUN statement -SHELL ["/bin/bash", "--login", "-c"] - -# Python version supported by recommenders -RUN if [ "${VIRTUAL_ENV}" = "conda" ] ; then conda install python=3.7; fi -RUN if [ "${VIRTUAL_ENV}" = "venv" ] ; then apt-get -y install python3.7; \ - apt-get -y install python3-pip; \ - apt-get -y install python3.7-venv; fi -RUN if [ "${VIRTUAL_ENV}" = "virtualenv" ] ; then apt-get -y install python3.7; \ - apt-get -y install python3-pip; \ - python3.7 -m pip install --user virtualenv; fi - -# Activate the virtual environment -RUN if [ "${VIRTUAL_ENV}" = "venv" ] ; then python3.7 -m venv $HOME/${VIRTUAL_ENV}; \ - source $HOME/${VIRTUAL_ENV}/bin/activate; \ - pip install --upgrade pip; \ - pip install --upgrade setuptools wheel; fi -RUN if [ "${VIRTUAL_ENV}" = "virtualenv" ] ; then python3.7 -m virtualenv $HOME/${VIRTUAL_ENV}; \ - source $HOME/${VIRTUAL_ENV}/bin/activate; \ - pip install --upgrade pip; \ - pip install --upgrade setuptools wheel; fi - -########### -# CPU Stage -########### -FROM base AS cpu - -RUN if [ "${VIRTUAL_ENV}" = "venv" ] || [ "${VIRTUAL_ENV}" = "virtualenv" ]; then source $HOME/${VIRTUAL_ENV}/bin/activate; \ - pip install recommenders[examples]; fi -RUN if [ "${VIRTUAL_ENV}" = "conda" ] ; then pip install recommenders[examples]; fi - - -############### -# PySpark Stage -############### -FROM base AS pyspark - -# Install Java version 8 RUN apt-get update && \ - apt-get install -y libgomp1 openjdk-8-jre - -ENV JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64" \ - PYSPARK_PYTHON="${HOME}/${VIRTUAL_ENV}/bin/python" \ - PYSPARK_DRIVER_PYTHON="${HOME}/${VIRTUAL_ENV}/bin/python" + DEBIAN_FRONTEND=noninteractive \ + apt-get install -y wget git && \ + apt-get clean -y && \ + rm -rf /var/lib/apt/lists/* -# Install dependencies in virtual environment -RUN if [ "${VIRTUAL_ENV}" = "venv" ] || [ "${VIRTUAL_ENV}" = "virtualenv" ]; then source $HOME/${VIRTUAL_ENV}/bin/activate; \ - pip install recommenders[spark,examples]; fi -RUN if [ "${VIRTUAL_ENV}" = "conda" ] ; then pip install recommenders[spark,examples]; fi +##################################################################### +# Dependencies Stage +# Set up all dependencies. This Stage is used by dev containers, +# because editable installation is required. 
+##################################################################### +FROM ${COMPUTE} AS deps -########### -# GPU Stage -########### -FROM mcr.microsoft.com/mirror/nvcr/nvidia/cuda:11.4.2-cudnn8-runtime-ubuntu18.04 AS gpu +# Valid versions: 3.8, 3.9, 3.10, 3.11 +ARG PYTHON_VERSION="3.11" -ARG HOME -ARG VIRTUAL_ENV -ENV HOME="${HOME}" -WORKDIR ${HOME} +ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 -# Exit if VIRTUAL_ENV is not specified correctly -RUN if [ "${VIRTUAL_ENV}" != "conda" ] && [ "${VIRTUAL_ENV}" != "venv" ] && [ "${VIRTUAL_ENV}" != "virtualenv" ]; then \ - echo 'VIRTUAL_ENV argument should be either "conda", "venv" or "virtualenv"'; exit 1; fi +WORKDIR /root +USER root:root -RUN apt-get update && \ - apt-get install -y curl build-essential -RUN apt-get install -y libpython3.7-dev -RUN apt-get install -y python3-dev - -# Install Anaconda -ARG ANACONDA="https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" -RUN if [ "${VIRTUAL_ENV}" = "conda" ] ; then curl ${ANACONDA} -o anaconda.sh && \ - /bin/bash anaconda.sh -b -p conda && \ - rm anaconda.sh && \ - echo ". ${HOME}/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ - echo "conda activate base" >> ~/.bashrc; fi - -ENV PATH="${HOME}/${VIRTUAL_ENV}/bin:${PATH}" - -SHELL ["/bin/bash", "--login", "-c"] - -RUN if [ "${VIRTUAL_ENV}" = "conda" ] ; then conda install python=3.7; fi -RUN if [ "${VIRTUAL_ENV}" = "venv" ] ; then apt-get -y install python3.7; \ - apt-get -y install python3-pip; \ - apt-get -y install python3.7-venv; fi -RUN if [ "${VIRTUAL_ENV}" = "virtualenv" ] ; then apt-get -y install python3.7; \ - apt-get -y install python3-pip; \ - python3.7 -m pip install --user virtualenv; fi - -# Activate the virtual environment -RUN if [ "${VIRTUAL_ENV}" = "venv" ] ; then python3.7 -m venv $HOME/${VIRTUAL_ENV}; \ - source $HOME/${VIRTUAL_ENV}/bin/activate; \ - pip install --upgrade pip; \ - pip install --upgrade setuptools wheel; \ - pip install recommenders[gpu,examples]; fi -RUN if [ "${VIRTUAL_ENV}" = "virtualenv" ] ; then python3.7 -m virtualenv $HOME/${VIRTUAL_ENV}; \ - source $HOME/${VIRTUAL_ENV}/bin/activate; \ - pip install --upgrade pip; \ - pip install --upgrade setuptools wheel; \ - pip install recommenders[gpu,examples]; fi - -RUN if [ "${VIRTUAL_ENV}" = "conda" ] ; then \ - pip install recommenders[gpu,examples] -f https://download.pytorch.org/whl/cu111/torch_stable.html ; fi - - -############ -# Full Stage -############ -FROM gpu AS full - -ARG HOME -WORKDIR ${HOME} - -SHELL ["/bin/bash", "--login", "-c"] - -# Install Java version 8 -RUN apt-get update && \ - apt-get install -y libgomp1 openjdk-8-jre +SHELL ["/bin/bash", "-c"] -ENV JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64" \ - PYSPARK_PYTHON="${HOME}/${VIRTUAL_ENV}/bin/python" \ - PYSPARK_DRIVER_PYTHON="${HOME}/${VIRTUAL_ENV}/bin/python" +# Install Conda +RUN wget -qO /tmp/conda.sh "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" && \ + bash /tmp/conda.sh -bf -p /root/conda && \ + /root/conda/bin/conda clean -ay && \ + rm -rf /root/conda/pkgs && \ + rm /tmp/conda.sh && \ + /root/conda/bin/conda init bash && \ + /root/conda/bin/conda config --set auto_activate_base false -# Install dependencies in virtual environment -RUN if [ "${VIRTUAL_ENV}" = "venv" ] || [ "${VIRTUAL_ENV}" = "virtualenv" ]; then source $HOME/${VIRTUAL_ENV}/bin/activate; \ - pip install recommenders[all]; fi -RUN if [ "${VIRTUAL_ENV}" = "conda" ] ; then pip install recommenders[all]; fi +# Create Conda environment +RUN /root/conda/bin/conda create -n Recommenders -c 
conda-forge -y python=${PYTHON_VERSION} pip -############# +##################################################################### # Final Stage -############# -FROM $ENV AS final - -# Setup Jupyter notebook configuration -ENV NOTEBOOK_CONFIG="${HOME}/.jupyter/jupyter_notebook_config.py" -RUN mkdir ${HOME}/.jupyter && \ - echo "c.NotebookApp.token = ''" >> ${NOTEBOOK_CONFIG} && \ - echo "c.NotebookApp.ip = '0.0.0.0'" >> ${NOTEBOOK_CONFIG} && \ - echo "c.NotebookApp.allow_root = True" >> ${NOTEBOOK_CONFIG} && \ - echo "c.NotebookApp.open_browser = False" >> ${NOTEBOOK_CONFIG} && \ - echo "c.MultiKernelManager.default_kernel_name = 'python3'" >> ${NOTEBOOK_CONFIG} - -# Register the environment with Jupyter -RUN if [ "${VIRTUAL_ENV}" = "conda" ]; then python -m ipykernel install --user --name base --display-name "Python (base)"; fi -RUN if [ "${VIRTUAL_ENV}" = "venv" ] || [ "${VIRTUAL_ENV}" = "virtualenv" ]; then source $HOME/${VIRTUAL_ENV}/bin/activate; \ - python -m ipykernel install --user --name venv --display-name "Python (venv)"; fi - -ARG HOME -WORKDIR ${HOME} - +# Install Recommenders +##################################################################### +FROM deps AS final + +# Extra dependencies: dev, gpu, spark +ARG EXTRAS="" + +# Git ref of Recommenders to install: main, staging, etc. +# Empty value ("") indicates editable installation of current clone +ARG GIT_REF="main" + +ARG JDK_VERSION="21" + +ARG RECO_DIR="/root/Recommenders" + +# Copy Recommenders into the image +COPY ./ ${RECO_DIR} + +# Install Recommenders and its dependencies +RUN source /root/conda/bin/activate && \ + conda activate Recommenders && \ + if [[ "${EXTRAS}" =~ spark ]]; then conda install -c conda-forge -y "openjdk=${JDK_VERSION}"; fi && \ + if [ -z "${GIT_REF}" ]; then \ + pip install ${RECO_DIR}${EXTRAS}; \ + else \ + pip install recommenders${EXTRAS}@git+https://github.com/recommenders-team/recommenders.git@${GIT_REF}; \ + fi && \ + jupyter notebook --generate-config && \ + echo "c.MultiKernelManager.default_kernel_name = 'Recommenders'" >> /root/.jupyter/jupyter_notebook_config.py && \ + python -m ipykernel install --user --name Recommenders --display-name "Python (Recommenders)" + +# Activate Recommenders Conda environment +ENV PS1='(Recommenders) \[\]\[\e]0;\u@\h: \w\a\]${debian_chroot:+($debian_chroot)}\u@\h:\w\$ \[\]' +ENV PATH="/root/conda/envs/Recommenders/bin:/root/conda/condabin:${PATH}" +ENV CONDA_SHLVL='1' +ENV CONDA_PROMPT_MODIFIER='(Recommenders) ' +ENV CONDA_PREFIX="/root/conda/envs/Recommenders" +ENV CONDA_EXE="/root/conda/bin/conda" +ENV CONDA_PYTHON_EXE="/root/conda/bin/python" +ENV JAVA_HOME="/root/conda/envs/Recommenders/lib/jvm" +ENV JAVA_LD_LIBRARY_PATH="${JAVA_HOME}/lib/server" + +# Setup Jupyter notebook EXPOSE 8888 -CMD ["jupyter", "notebook"] +CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--no-browser", "--allow-root", "--ServerApp.allow_origin='*'", "--IdentityProvider.token=''"] diff --git a/tools/docker/README.md b/tools/docker/README.md index adf5997a7..9ce38a00a 100644 --- a/tools/docker/README.md +++ b/tools/docker/README.md @@ -1,92 +1,116 @@ Docker Support ============== -The Dockerfile in this directory will build Docker images with all the dependencies and code needed to run example notebooks or unit tests included in this repository. +The Dockerfile in this directory will build Docker images with all +the dependencies and code needed to run example notebooks or unit +tests included in this repository. 
It is also used by
+* [.devcontainer/devcontainer.json](../../.devcontainer/devcontainer.json)
+  to build
+  [VS Code Dev Containers](https://code.visualstudio.com/docs/devcontainers/containers)
+  that can facilitate the development of Recommenders
+  (See [Setup Guide](../../SETUP.md)),
+* and [tests/ci/azureml_tests/aml_utils.py](../../tests/ci/azureml_tests/aml_utils.py)
+  to create the environment in [the testing workflows of Recommenders](../../.github/workflows/) (See [Tests](../../tests/README.md)).
+
+Multiple environments are supported by using
+[multistage builds](https://docs.docker.com/build/building/multi-stage/).
+The following examples show how to build and run the Docker image for
+CPU, PySpark, and GPU environments.
+
+Once the container is running you can access Jupyter notebooks at
+http://localhost:8888.
 
-Multiple environments are supported by using [multistage builds](https://docs.docker.com/develop/develop-images/multistage-build/). In order to efficiently build the Docker images in this way, [Docker BuildKit](https://docs.docker.com/develop/develop-images/build_enhancements/) is necessary.
-The following examples show how to build and run the Docker image for CPU, PySpark, and GPU environments.
-
-Note: On some platforms, one needs to manually specify the environment variable for `DOCKER_BUILDKIT`to make sure the build runs well. For example, on a Windows machine, this can be done by the powershell command as below, before building the image
-```
-$env:DOCKER_BUILDKIT=1
-```
-
-Warning: On some platforms using Docker Buildkit interferes with Anaconda environment installation. If you find that the docker build is hanging during Anaconda environment setup stage try building the container without Buildkit enabled.
-
-Once the container is running you can access Jupyter notebooks at http://localhost:8888.
 
 Building and Running with Docker
 --------------------------------
 
-See examples below for the case of conda. If you use venv or virtualenv instead, replace `--build-arg VIRTUAL_ENV=conda` with `--build-arg VIRTUAL_ENV=venv` or `--build-arg VIRTUAL_ENV=virtualenv`, respectively.
-
-CPU environment - -``` -DOCKER_BUILDKIT=1 docker build -t recommenders:cpu --build-arg ENV=cpu --build-arg VIRTUAL_ENV=conda . -docker run -p 8888:8888 -d recommenders:cpu -``` +* **CPU environment** -
+ ```bash + docker build -t recommenders:cpu . + docker run -v ../../examples:/root/examples -p 8888:8888 -d recommenders:cpu + ``` -
-PySpark environment -``` -DOCKER_BUILDKIT=1 docker build -t recommenders:pyspark --build-arg ENV=pyspark --build-arg VIRTUAL_ENV=conda . -docker run -p 8888:8888 -d recommenders:pyspark -``` +* **PySpark environment** -
+ ```bash + docker build -t recommenders:pyspark --build-arg EXTRAS=[spark] . + docker run -v ../../examples:/root/examples -p 8888:8888 -d recommenders:pyspark + ``` -
-GPU environment +* **GPU environment** -``` -DOCKER_BUILDKIT=1 docker build -t recommenders:gpu --build-arg ENV=gpu --build-arg VIRTUAL_ENV=conda . -docker run --runtime=nvidia -p 8888:8888 -d recommenders:gpu -``` + ```bash + docker build -t recommenders:gpu --build-arg COMPUTE=gpu . + docker run --runtime=nvidia -v ../../examples:/root/examples -p 8888:8888 -d recommenders:gpu + ``` -
-
-GPU + PySpark environment +* **GPU + PySpark environment** -``` -DOCKER_BUILDKIT=1 docker build -t recommenders:full --build-arg ENV=full --build-arg VIRTUAL_ENV=conda . -docker run --runtime=nvidia -p 8888:8888 -d recommenders:full -``` + ```bash + docker build -t recommenders:gpu-pyspark --build-arg COMPUTE=gpu --build-arg EXTRAS=[gpu,spark] . + docker run --runtime=nvidia -v ../../examples:/root/examples -p 8888:8888 -d recommenders:gpu-pyspark + ``` -
 Build Arguments
 ---------------
 
-There are several build arguments which can change how the image is built. Similar to the `ENV` build argument these are specified during the docker build command.
+There are several build arguments which can change how the image is
+built. Similar to the `COMPUTE` build argument, these are specified
+during the docker build command.
 
 Build Arg|Description|
 ---------|-----------|
-ENV|Environment to use, options: cpu, pyspark, gpu, full (defaults to cpu)|
-VIRTUAL_ENV|Virtual environment to use; mandatory argument, must be one of "conda", "venv", "virtualenv"|
-ANACONDA|Anaconda installation script (defaults to miniconda3 4.6.14)|
+`COMPUTE`|Compute to use, options: `cpu`, `gpu` (defaults to `cpu`)|
+`EXTRAS`|Extra dependencies to use, options: `dev`, `gpu`, `spark` (defaults to none ("")); for example, `[gpu,spark]`|
+`GIT_REF`|Git ref of Recommenders to install, options: `main`, `staging`, etc. (defaults to `main`); Empty value means editable installation of current clone|
+`JDK_VERSION`|OpenJDK version to use (defaults to `21`)|
+`PYTHON_VERSION`|Python version to use (defaults to `3.11`)|
+`RECO_DIR`|Path to the copy of Recommenders in the container when `GIT_REF` is empty (defaults to `/root/Recommenders`)|
+
+Examples:
+* Install Python 3.10 and the Recommenders package from the staging branch.
+
+  ```bash
+  docker build -t recommenders:staging --build-arg GIT_REF=staging --build-arg PYTHON_VERSION=3.10 .
+  ```
+
+* Install the current local clone of Recommenders and its extra 'dev' dependencies.
-Example:
+
+  ```bash
+  # Go to the root directory of Recommenders to copy the local clone into the Docker image
+  cd ../../
+  docker build -t recommenders:dev --build-arg GIT_REF= --build-arg EXTRAS=[dev] -f tools/docker/Dockerfile .
+  ```
-```
-DOCKER_BUILDKIT=1 docker build -t recommenders:cpu --build-arg ENV=cpu --build-arg VIRTUAL_ENV=conda .
-```
+In order to see detailed progress you can provide a flag during the
+build command: ```--progress=plain```
-In order to see detailed progress with BuildKit you can provide a flag during the build command: ```--progress=plain```
 
 
 Running tests with Docker
 -------------------------
 
-To run the tests using e.g. the CPU image, do the following:
-```
-docker run -it recommenders:cpu bash -c 'pip install pytest; \
-pip install pytest-cov; \
-pip install pytest-mock; \
-apt-get install -y git; \
-git clone https://github.com/recommenders-team/recommenders.git; \
-cd recommenders; \
-pytest tests/unit -m "not spark and not gpu and not notebooks and not experimental"' -``` \ No newline at end of file
+* Run the tests using the `recommenders:cpu` image built above.
+  NOTE: The `recommenders:cpu` image only installs the Recommenders
+  package under [../../recommenders/](../../recommenders/).
+
+  ```bash
+  docker run -it recommenders:cpu bash -c 'pip install pytest; \
+  pip install pytest-cov; \
+  pip install pytest-mock; \
+  apt-get install -y git; \
+  git clone https://github.com/recommenders-team/recommenders.git; \
+  cd recommenders; \
+  pytest tests/unit -m "not spark and not gpu and not notebooks and not experimental"'
+  ```
+
+* Run the tests using the `recommenders:dev` image built above.
+  NOTE: The `recommenders:dev` image has a full copy of your local
+  Recommenders repository.
+ + ```bash + docker run -it recommenders:dev bash -c 'cd Recommenders; \ + pytest tests/unit -m "not spark and not gpu and not notebooks and not experimental"' + ``` From ff15c4e47615438faa7be3e02192d33678ba6179 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Sun, 17 Nov 2024 07:56:29 +0100 Subject: [PATCH 62/67] Update runner to ubuntu-24.04 Signed-off-by: miguelgfierro --- .github/workflows/azureml-cpu-nightly.yml | 2 +- .github/workflows/azureml-gpu-nightly.yml | 2 +- .github/workflows/azureml-release-pipeline.yml | 2 +- .github/workflows/azureml-spark-nightly.yml | 2 +- .github/workflows/azureml-unit-tests.yml | 2 +- .github/workflows/sarplus.yml | 2 +- .github/workflows/update_documentation.yml | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/azureml-cpu-nightly.yml b/.github/workflows/azureml-cpu-nightly.yml index 616707f7f..1a147a2ca 100644 --- a/.github/workflows/azureml-cpu-nightly.yml +++ b/.github/workflows/azureml-cpu-nightly.yml @@ -44,7 +44,7 @@ on: jobs: get-test-groups: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 steps: - name: Check out repository code uses: actions/checkout@v4 diff --git a/.github/workflows/azureml-gpu-nightly.yml b/.github/workflows/azureml-gpu-nightly.yml index 23cffda0a..2cfd80d7e 100644 --- a/.github/workflows/azureml-gpu-nightly.yml +++ b/.github/workflows/azureml-gpu-nightly.yml @@ -44,7 +44,7 @@ on: jobs: get-test-groups: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 steps: - name: Check out repository code uses: actions/checkout@v4 diff --git a/.github/workflows/azureml-release-pipeline.yml b/.github/workflows/azureml-release-pipeline.yml index 983cce9db..66e48ca48 100644 --- a/.github/workflows/azureml-release-pipeline.yml +++ b/.github/workflows/azureml-release-pipeline.yml @@ -29,7 +29,7 @@ jobs: secrets: inherit create-release: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 needs: [unit-test-workflow, cpu-nightly-workflow, gpu-nightly-workflow, spark-nightly-workflow] steps: - name: Check out repository code diff --git a/.github/workflows/azureml-spark-nightly.yml b/.github/workflows/azureml-spark-nightly.yml index da508ebe4..bb393059c 100644 --- a/.github/workflows/azureml-spark-nightly.yml +++ b/.github/workflows/azureml-spark-nightly.yml @@ -43,7 +43,7 @@ on: jobs: get-test-groups: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 steps: - name: Check out repository code uses: actions/checkout@v4 diff --git a/.github/workflows/azureml-unit-tests.yml b/.github/workflows/azureml-unit-tests.yml index 0f7ed2a18..c346edff3 100644 --- a/.github/workflows/azureml-unit-tests.yml +++ b/.github/workflows/azureml-unit-tests.yml @@ -33,7 +33,7 @@ on: jobs: get-test-groups: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 steps: - name: Check out repository code uses: actions/checkout@v4 diff --git a/.github/workflows/sarplus.yml b/.github/workflows/sarplus.yml index 90d03fef6..832249c26 100644 --- a/.github/workflows/sarplus.yml +++ b/.github/workflows/sarplus.yml @@ -36,7 +36,7 @@ jobs: # Test pysarplus with different versions of Python. # Package pysarplus and upload as GitHub workflow artifact when merged into # the main branch. 
- runs-on: ubuntu-22.04 + runs-on: ubuntu-24.04 strategy: matrix: python-version: ["3.8", "3.9", "3.10", "3.11"] diff --git a/.github/workflows/update_documentation.yml b/.github/workflows/update_documentation.yml index 30e2eadf1..bcac8bd2e 100644 --- a/.github/workflows/update_documentation.yml +++ b/.github/workflows/update_documentation.yml @@ -12,7 +12,7 @@ on: jobs: build: - runs-on: ubuntu-22.04 + runs-on: ubuntu-24.04 steps: - name: Checkout repository From 2f2cf910510a1e85a6ed753e66014c2a99e98be1 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Sun, 17 Nov 2024 08:02:07 +0100 Subject: [PATCH 63/67] Timeout update Signed-off-by: miguelgfierro --- .github/workflows/azureml-cpu-nightly.yml | 3 ++- .github/workflows/azureml-gpu-nightly.yml | 3 ++- .github/workflows/azureml-spark-nightly.yml | 3 ++- .github/workflows/azureml-unit-tests.yml | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/azureml-cpu-nightly.yml b/.github/workflows/azureml-cpu-nightly.yml index 1a147a2ca..2f61b0c92 100644 --- a/.github/workflows/azureml-cpu-nightly.yml +++ b/.github/workflows/azureml-cpu-nightly.yml @@ -63,7 +63,8 @@ jobs: execute-tests: needs: get-test-groups name: ${{ join(matrix.*, ', ') }} - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 + timeout-minutes: 120 permissions: id-token: write # This is required for requesting the JWT strategy: diff --git a/.github/workflows/azureml-gpu-nightly.yml b/.github/workflows/azureml-gpu-nightly.yml index 2cfd80d7e..b69d7fb9f 100644 --- a/.github/workflows/azureml-gpu-nightly.yml +++ b/.github/workflows/azureml-gpu-nightly.yml @@ -63,7 +63,8 @@ jobs: execute-tests: needs: get-test-groups name: ${{ join(matrix.*, ', ') }} - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 + timeout-minutes: 120 permissions: id-token: write # This is required for requesting the JWT strategy: diff --git a/.github/workflows/azureml-spark-nightly.yml b/.github/workflows/azureml-spark-nightly.yml index bb393059c..7f9987d15 100644 --- a/.github/workflows/azureml-spark-nightly.yml +++ b/.github/workflows/azureml-spark-nightly.yml @@ -62,7 +62,8 @@ jobs: execute-tests: needs: get-test-groups name: ${{ join(matrix.*, ', ') }} - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 + timeout-minutes: 120 permissions: id-token: write # This is required for requesting the JWT strategy: diff --git a/.github/workflows/azureml-unit-tests.yml b/.github/workflows/azureml-unit-tests.yml index c346edff3..9766d8831 100644 --- a/.github/workflows/azureml-unit-tests.yml +++ b/.github/workflows/azureml-unit-tests.yml @@ -52,7 +52,8 @@ jobs: execute-tests: needs: get-test-groups name: ${{ join(matrix.*, ', ') }} - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 + timeout-minutes: 120 permissions: id-token: write # This is required for requesting the JWT strategy: From 53b5d53bb85a9948b2dfd113b87e182c630b12e0 Mon Sep 17 00:00:00 2001 From: Simon Zhao Date: Mon, 2 Dec 2024 23:12:07 +0800 Subject: [PATCH 64/67] Install sbt for sarplus tests (#2192) Signed-off-by: Simon Zhao --- .github/workflows/sarplus.yml | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/.github/workflows/sarplus.yml b/.github/workflows/sarplus.yml index 832249c26..5ae935696 100644 --- a/.github/workflows/sarplus.yml +++ b/.github/workflows/sarplus.yml @@ -8,7 +8,7 @@ # * GitHub Actions workflow templates # + [python package](https://github.com/actions/starter-workflows/blob/main/ci/python-package.yml) # + 
[scala](https://github.com/actions/starter-workflows/blob/main/ci/scala.yml) -# * [GitHub hosted runner - Ubuntu 20.04 LTS](https://github.com/actions/virtual-environments/blob/main/images/linux/Ubuntu2004-README.md) +# * [GitHub hosted runner - Ubuntu 24.04 LTS](https://github.com/actions/runner-images/blob/main/images/ubuntu/Ubuntu2404-Readme.md) # * [Azure Databricks runtime releases](https://docs.microsoft.com/en-us/azure/databricks/release-notes/runtime/releases) # * [Azure Synapse Analytics runtimes](https://docs.microsoft.com/en-us/azure/synapse-analytics/spark/apache-spark-version-support) @@ -53,6 +53,14 @@ jobs: python -m pip install -U build cibuildwheel pip twine python -m pip install -U flake8 pytest pytest-cov scikit-learn + # Install sbt + # See https://github.com/yokra9/akka-http-example/pull/119/files + echo "deb https://repo.scala-sbt.org/scalasbt/debian all main" | sudo tee /etc/apt/sources.list.d/sbt.list + echo "deb https://repo.scala-sbt.org/scalasbt/debian /" | sudo tee /etc/apt/sources.list.d/sbt_old.list + curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" | sudo apt-key add + sudo apt-get update + sudo apt-get install sbt + - name: Lint with flake8 run: | cd "${PYTHON_ROOT}" @@ -133,6 +141,15 @@ jobs: steps: - uses: actions/checkout@v4 + - name: Install sbt + run: | + # See https://github.com/yokra9/akka-http-example/pull/119/files + echo "deb https://repo.scala-sbt.org/scalasbt/debian all main" | sudo tee /etc/apt/sources.list.d/sbt.list + echo "deb https://repo.scala-sbt.org/scalasbt/debian /" | sudo tee /etc/apt/sources.list.d/sbt_old.list + curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" | sudo apt-key add + sudo apt-get update + sudo apt-get install sbt + - name: Test run: | cd "${SCALA_ROOT}" From b15263d270c8f3913a90898de8d294306ce8cf17 Mon Sep 17 00:00:00 2001 From: Miguel Fierro <3491412+miguelgfierro@users.noreply.github.com> Date: Sun, 22 Dec 2024 14:28:43 +0100 Subject: [PATCH 65/67] Update Tao's user (#2199) Signed-off-by: miguelgfierro Co-authored-by: miguelgfierro --- .github/CODEOWNERS | 2 +- AUTHORS.md | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 71e970f23..55fe145a4 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,5 +1,5 @@ # See https://help.github.com/articles/about-codeowners/ # for more info about CODEOWNERS file -* @miguelgfierro @gramhagen @anargyri @loomlike @wutaomsft @simonyansenzhao +* @miguelgfierro @gramhagen @anargyri @loomlike @wav8k @simonyansenzhao diff --git a/AUTHORS.md b/AUTHORS.md index b70bfa644..da4113c74 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -32,14 +32,13 @@ They have admin access to the repo and provide support reviewing issues and pull * **[Miguel González-Fierro](https://github.com/miguelfierro)** * Recommendation algorithms review, development and optimization. * Reco utils review, development and optimization. - * Github statistics. * Continuous integration build / test setup. 
* **[Scott Graham](https://github.com/gramhagen)**
   * Improving documentation
   * VW notebook
 * **[Simon Zhao](https://github.com/simonyansenzhao)**
   * SARplus algorithm upgrade
-* **[Tao Wu](https://github.com/wutaomsft)**
+* **[Tao Wu](https://github.com/wav8k)**
   * Improving documentation

From 7166fceaefaadc1db0d59b666007877f213c0f76 Mon Sep 17 00:00:00 2001
From: Miguel Fierro <3491412+miguelgfierro@users.noreply.github.com>
Date: Sun, 22 Dec 2024 14:29:05 +0100
Subject: [PATCH 66/67] Execute tests when there are changes in the pipelines
 (#2197)

Signed-off-by: miguelgfierro
Co-authored-by: miguelgfierro
---
 .github/workflows/azureml-unit-tests.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/azureml-unit-tests.yml b/.github/workflows/azureml-unit-tests.yml
index 9766d8831..b79ff2b79 100644
--- a/.github/workflows/azureml-unit-tests.yml
+++ b/.github/workflows/azureml-unit-tests.yml
@@ -19,6 +19,7 @@ on:
       - tests/**
       - '!tests/**/*.md'
       - setup.py
+      - .github/**
     types: [opened, synchronize, reopened, ready_for_review]
 
 # Enable manual trigger

From 90b180ad40e922d548231bb70805e35833bfefd3 Mon Sep 17 00:00:00 2001
From: Miguel Fierro <3491412+miguelgfierro@users.noreply.github.com>
Date: Sun, 22 Dec 2024 14:29:24 +0100
Subject: [PATCH 67/67] Prepare for release (#2200)

* Recommenders 1.2.1

Signed-off-by: miguelgfierro

* update readmes

Signed-off-by: miguelgfierro

* updating the date

---------

Signed-off-by: miguelgfierro
Co-authored-by: miguelgfierro
---
 NEWS.md                  | 6 ++++++
 README.md                | 6 +++---
 recommenders/__init__.py | 2 +-
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 9d8b1aeb6..40347161f 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -5,6 +5,12 @@
 
 # What's New
 
+## Update December 23, 2024
+
+We have a new release [Recommenders 1.2.1](https://github.com/recommenders-team/recommenders/releases/tag/1.2.1)!
+
+We fixed a number of dependency-related bugs, improved security, and reviewed the notebooks and the libraries.
+
 ## Update May 2, 2024
 
 We have a new release [Recommenders 1.2.0](https://github.com/microsoft/recommenders/releases/tag/1.2.0)!
 
diff --git a/README.md b/README.md
index 7f78b29f8..a56c1a0d0 100644
--- a/README.md
+++ b/README.md
@@ -15,11 +15,11 @@ Licensed under the MIT License.
-## What's New (May, 2024)
+## What's New (Dec, 2024)
 
-We have a new release [Recommenders 1.2.0](https://github.com/recommenders-team/recommenders/releases/tag/1.2.0)!
+We have a new release [Recommenders 1.2.1](https://github.com/recommenders-team/recommenders/releases/tag/1.2.1)!
 
-So many changes since our last release. We have full tests on Python 3.8 to 3.11 (around 1800 tests), upgraded performance in many algorithms, reviewed notebooks, and many more improvements.
+We fixed a number of dependency-related bugs, improved security, and reviewed the notebooks and the libraries.
 
 ## Introduction
 
diff --git a/recommenders/__init__.py b/recommenders/__init__.py
index 87998b029..a4d40c8f6 100644
--- a/recommenders/__init__.py
+++ b/recommenders/__init__.py
@@ -2,7 +2,7 @@
 # Licensed under the MIT License.
 
 __title__ = "Recommenders"
-__version__ = "1.2.0"
+__version__ = "1.2.1"
 __author__ = "Recommenders contributors"
 __license__ = "MIT"
 __copyright__ = "Copyright 2018-present Recommenders contributors."
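As a quick sanity check after installing the 1.2.1 release, a minimal, illustrative snippet using only the metadata defined in `recommenders/__init__.py` above:

```python
# Verify that an installed environment picked up the version bumped
# in the final patch of this series.
import recommenders

print(recommenders.__title__, recommenders.__version__)
assert recommenders.__version__ == "1.2.1"
```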