Merge pull request #60 from PRIDE-Archive/searchByKeyWord

Search by key word
PRIDE-Archive · Feb 10, 2025 · 0d8b565 · 0d8b565
2 parents e57c89a + 1d95677
commit 0d8b565
Show file tree

Hide file tree

Showing 2 changed files with 67 additions and 10 deletions.
diff --git a/README.md b/README.md
@@ -58,6 +58,7 @@ Commands:
   get-projects-by-accession       get projects by accession... 
   stream-files-metadata           Stream all files metadata in...
   stream-projects-metadata        Stream all projects metadata...
+  search-projects-by-keywords-and-filters Search all projects by keywords...
 
 ```
 > [!NOTE]
@@ -135,7 +136,7 @@ $ pridepy download-file-by-name -a PXD022105 -o /Users/yourname/Downloads/folder
 >[!WARNING]
 > To download private files, the user should use the same command as for downloading a single file by name. The only difference is that the user should provide the username and password. However, specifying a protocol is unnecessary in this case, as the tool will use the https protocol to download the files. At the moment we only allow this protocol because of the infrastructure of PRIDE private files (read the whitepaper for more information).
 
-## Streamming metadata
+## Streaming metadata
 
 One of the great features of PRIDE and pridepy is the ability to stream metadata of all projects and files. This is useful for users who want to analyze the metadata of all projects and files locally.
 
@@ -156,6 +157,14 @@ Stream the files metadata of a specific project as JSON and write it to a file:
 $ pridepy stream-files-metadata -o PXD005011_files.json -a PXD005011
 ```
 
+## Search projects by keywords and filters
+
+Get project metadata by keywords and filters.
+
+```bash
+$  python -m pridepy.pridepy search-projects-by-keywords-and-filters -f projectTags==Proteometools,organismsPart==Pancreas -k human -sd DESC -sf accession -sf submissionDate
+```
+
 # White paper
 
 A white paper is available [here](paper/paper.md). We can build it as PDF using pandoc.

diff --git a/pridepy/pridepy.py b/pridepy/pridepy.py
@@ -126,7 +126,7 @@ def download_all_public_raw_files(
     "--category",
     required=True,
     help="Category of the files to be downloaded",
-    type=click.Choice("RAW,PEAK,SEARCH,RESULT,SPECTRUM_LIBRARY,OTHER, FASTA".split(",")),
+    type=click.Choice("RAW,PEAK,SEARCH,RESULT,SPECTRUM_LIBRARY,OTHER,FASTA".split(",")),
 )
 def download_all_public_category_files(
     accession: str,
@@ -312,23 +312,71 @@ def stream_files_metadata(accession, output_file):
 @click.option(
     "-k",
     "--keyword",
-    required=False,
-    default="",
+    required=True,
     help="The entered word will be searched among the fields to fetch "
-    "matching pride. The structure of the keyword is : *:*",
+    "matching pride."
+)
+@click.option(
+    "-f",
+    "--filter",
+    required=False,
+    help="Parameters to filter the search results. The structure of the "
+    "filter is: field1==value1, field2==value2. Example "
+    "accession==PRD000001",
+)
+@click.option(
+    "-ps",
+    "--page_size",
+    required=False,
+    default=100,
+    type=click.IntRange(min=1, max=1000),
+    help="Number of results to fetch in a page",
+)
+@click.option(
+    "-p",
+    "--page",
+    required=False,
+    default=0,
+    type=click.IntRange(min=0),
+    help="Identifies which page of results to fetch",
+)
+@click.option(
+    "-sd",
+    "--sort_direction",
+    required=False,
+    default="DESC",
+    help="Sorting direction: ASC or DESC",
+)
+@click.option(
+    "-sf",
+    "--sort_fields",
+    required=False,
+    default=["submission_date"],
+    multiple=True,
+    help="Field(s) for sorting the results on. Default for this "
+    "request is submission_date. More fields can be separated by "
+    "comma and passed. Example: submissionDate,accession",
+    type=click.Choice("accession,submissionDate,diseases,organismsPart,organisms,instruments,softwares,"
+                      "avgDownloadsPerFile,downloadCount,publicationDate".split(",")),
 )
 def search_projects_by_keywords_and_filters(
-    keyword, filter, page_size, page, date_gap, sort_direction, sort_fields
+    keyword, filter, page_size, page, sort_direction, sort_fields
 ):
     """
-    TODO: @selva this function and command line should be reimplemented.
-    TODO: The idea is that the user can type a keyword or keywords and filters and get all the files projects in
-    TODO: JSON. Please remember to update the README.
+    Search all projects by keywords and filters
+    Parameters:
+        keyword (str): keyword to search in entire project.
+        filter (str): filter the search results. field1==value1
+        page_size (int): no of records or projects per page
+        page (int): Page number
+        sort_direction (str): sort direction of the results based on sortfield
+        sort_fields (str): field to sort the results by.
     """
     project = Project()
+    sf = ', '.join(sort_fields)
     logging.info(
         project.search_by_keywords_and_filters(
-            keyword, filter, page_size, page, sort_direction, sort_fields
+            keyword, filter, page_size, page, sort_direction, sf
         )
     )