v2.0.0: Katsu UUIDs, completeness stats #16

Merged: 22 commits, Feb 2, 2024
Commits
973e686
Typo in the page number
OrdiNeu Sep 15, 2023
87c5a12
[Misc] Error handling when HTSGet does not respond with a parseable r…
OrdiNeu Sep 18, 2023
4d95006
Fix a bug with missing dates of birth?
OrdiNeu Sep 18, 2023
9f35075
Add cohort filtering
OrdiNeu Oct 13, 2023
c541ceb
Remove docker-compose.yml (Thanks @daisieh !)
OrdiNeu Oct 13, 2023
c155777
Merge pull request #1 from CanDIG/fnguyen/initial-pass
OrdiNeu Oct 13, 2023
48b29f5
[wip]
OrdiNeu Oct 30, 2023
5da4c56
Fix to the age at diagnosis summary stat not working
OrdiNeu Nov 10, 2023
f12540a
Merge pull request #4 from CanDIG/fnguyen/age_at_diagnosis_fix
OrdiNeu Nov 10, 2023
be5e908
Fixup completeness query to split out genomic/transcriptome
OrdiNeu Nov 14, 2023
5600f6b
Merge pull request #2 from CanDIG/fnguyen/completeness
OrdiNeu Nov 23, 2023
42c5bad
Fix an issue where an unknown date of birth or diagnosis would fail
OrdiNeu Dec 7, 2023
934b13a
Fix an issue where NoneType keys would crash Flask
OrdiNeu Dec 7, 2023
8e6c9fa
Fix the page/size being strings instead of ints
OrdiNeu Dec 14, 2023
e15938e
Merge pull request #5 from CanDIG/fnguyen/date-fix
OrdiNeu Dec 19, 2023
a696e9d
Add fixes for things that changed with the Katsu UUID fix
OrdiNeu Dec 13, 2023
a1bec44
remove libressl-dev
daisieh Dec 19, 2023
262232e
Update query_operations.py
daisieh Feb 1, 2024
99edf8c
Add fixes for things that changed with the Katsu UUID fix
OrdiNeu Dec 13, 2023
f724417
Fix the usage of the candig user
OrdiNeu Dec 15, 2023
fd8fe83
update uwsgi
daisieh Dec 19, 2023
c2b7860
Fix for the search page on query
OrdiNeu Jan 26, 2024
5 changes: 4 additions & 1 deletion Dockerfile
@@ -28,7 +28,6 @@ RUN apk add --no-cache \
curl \
curl-dev \
yaml-dev \
libressl-dev \
pcre-dev \
git \
sqlite
@@ -41,6 +40,10 @@ COPY . /app/query_server

WORKDIR /app/query_server

RUN chown -R candig:candig /app/query_server

USER candig

RUN touch initial_setup

ENTRYPOINT ["bash", "entrypoint.sh"]
22 changes: 0 additions & 22 deletions docker-compose.yml

This file was deleted.

41 changes: 37 additions & 4 deletions query_server/openapi.yaml
@@ -37,11 +37,12 @@ paths:
- $ref: "#/components/parameters/geneParam"
- $ref: "#/components/parameters/assemblyParam"
- $ref: "#/components/parameters/pageSizeParam"
- $ref: "#/components/parameters/pageNumParam"
- $ref: "#/components/parameters/pageParam"
- $ref: "#/components/parameters/excludeCohortsParam"
operationId: query_operations.query
responses:
200:
description: Retrieve info about the Query service
description: Retrieved donor information
content:
application/json:
schema:
@@ -52,6 +53,21 @@
$ref: "#/components/responses/404NotFoundError"
5XX:
$ref: "#/components/responses/5xxServerError"
/genomic_completeness:
get:
summary: Retrieve summary statistics on genomic data
description: Retrieve summary statistics on genomic data
operationId: query_operations.genomic_completeness
responses:
200:
description: Retrieved genomic completeness information
content:
application/json:
schema:
$ref: '#/components/schemas/GenomicCompletenessBody'
5XX:
$ref: "#/components/responses/5xxServerError"

components:
parameters:
treatmentParam:
@@ -118,22 +134,29 @@ components:
example: hg38
schema:
$ref: '#/components/schemas/Field'
excludeCohortsParam:
in: query
name: exclude_cohorts
description: A list of cohorts that will be excluded from results
example: SYNTHETIC-1
schema:
$ref: '#/components/schemas/Fields'
pageSizeParam:
in: query
name: page_size
description: The number of donors to return per page
example: 10
required: false
schema:
$ref: '#/components/schemas/Field'
$ref: '#/components/schemas/IntField'
pageParam:
in: query
name: page
description: The page number to grab
example: 1
required: false
schema:
$ref: '#/components/schemas/Field'
$ref: '#/components/schemas/IntField'
# sessionIDParam:
# in: query
# name: session_id
@@ -152,6 +175,9 @@ components:
Field:
type: string
description: Acceptable requested string for querying
IntField:
type: integer
description: Acceptable integer for querying
QueryBody:
type: object
description: Query response
@@ -168,6 +194,13 @@ components:
prev:
type: string
description: URL to grab the previous set of results
GenomicCompletenessBody:
type: object
description: Genomic completeness statistics
properties:
results:
type: object
description: Maps each program ID (key) to an object counting cases with complete genomic and transcriptome data
# ERROR SCHEMAS
Error:
type: object
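
For illustration, a hypothetical call exercising the renamed integer pagination parameters (now IntField) and the new exclude_cohorts parameter; the host, port, token, and values below are invented:

import requests

# Placeholder deployment URL and bearer token.
params = {"gene": "BRCA2", "page": 1, "page_size": 10,
          "exclude_cohorts": ["SYNTHETIC-1"]}
r = requests.get("http://localhost:1236/query", params=params,
                 headers={"Authorization": "Bearer <token>"})
donors = r.json()["results"]  # paginated donor records, per QueryBody
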
113 changes: 92 additions & 21 deletions query_server/query_operations.py
@@ -44,9 +44,8 @@ def get_donors_from_katsu(url, param_name, parameter_list):
'page_size': PAGE_SIZE
}
treatments = requests.get(f"{url}?{urllib.parse.urlencode(parameters)}", headers=request.headers)
results = safe_get_request_json(treatments, f'Katsu {param_name}')['results']
results = safe_get_request_json(treatments, f'Katsu {param_name}')['items']
permissible_donors |= set([result['submitter_donor_id'] for result in results])
print(permissible_donors)
return permissible_donors

def add_or_increment(dict, key):
@@ -59,17 +58,31 @@ def get_summary_stats(donors, headers):
# Perform (and cache) summary statistics
diagnoses = requests.get(f"{config.KATSU_URL}/v2/authorized/primary_diagnoses/?page_size={PAGE_SIZE}",
headers=headers)
diagnoses = safe_get_request_json(diagnoses, 'Katsu diagnoses')['results']
diagnoses = safe_get_request_json(diagnoses, 'Katsu diagnoses')['items']
# This search is inefficient O(m*n)
# Should find a better way (Preferably SQL again)
donor_ids = [donor['submitter_donor_id'] for donor in donors]
donor_date_of_births = [donor['date_of_birth'] for donor in donors]
donor_date_of_births = {}
for donor in donors:
donor_date_of_births[donor['submitter_donor_id']] = donor['date_of_birth']
age_at_diagnosis = {}
for diagnosis in diagnoses:
if diagnosis['submitter_donor_id'] in donor_ids:
donor_idx = donor_ids.index(diagnosis['submitter_donor_id'])
if diagnosis['submitter_donor_id'] in donor_date_of_births:
# Make sure we have both dates necessary for this analysis
if 'date_of_diagnosis' not in diagnosis or diagnosis['date_of_diagnosis'] is None:
print(f"Unable to find diagnosis date for {diagnosis['submitter_donor_id']}")
add_or_increment(age_at_diagnosis, 'Unknown')
continue
if diagnosis['submitter_donor_id'] not in donor_date_of_births or donor_date_of_births[diagnosis['submitter_donor_id']] is None:
print(f"Unable to find date of birth for {diagnosis['submitter_donor_id']}")
add_or_increment(age_at_diagnosis, 'Unknown')
continue

diag_date = diagnosis['date_of_diagnosis'].split('-')
birth_date = donor_date_of_births[donor_idx].split('-')
birth_date = donor_date_of_births[diagnosis['submitter_donor_id']].split('-')
if len(diag_date) < 2 or len(birth_date) < 2:
print(f"Unable to find date of birth/diagnosis for {diagnosis['submitter_donor_id']}")
add_or_increment(age_at_diagnosis, 'Unknown')
continue

age = int(diag_date[0]) - int(birth_date[0])
if int(diag_date[1]) >= int(birth_date[1]):
@@ -86,11 +99,11 @@ def get_summary_stats(donors, headers):
# http://candig.docker.internal:8008/v2/authorized/treatments/
treatments = requests.get(f"{config.KATSU_URL}/v2/authorized/treatments/?page_size={PAGE_SIZE}",
headers=headers)
treatments = safe_get_request_json(treatments, 'Katsu treatments')['results']
treatments = safe_get_request_json(treatments, 'Katsu treatments')['items']
treatment_type_count = {}
for treatment in treatments:
# This search is inefficient O(m*n)
if treatment['submitter_donor_id'] in donor_ids:
if treatment['submitter_donor_id'] in donor_date_of_births:
for treatment_type in treatment['treatment_type']:
add_or_increment(treatment_type_count, treatment_type)

@@ -108,7 +121,7 @@ def get_summary_stats(donors, headers):
patients_per_cohort[program_id] += 1
else:
patients_per_cohort[program_id] = 1

return {
'age_at_diagnosis': age_at_diagnosis,
'treatment_type_count': treatment_type_count,
@@ -153,8 +166,26 @@ def query_htsget_pos(headers, assembly, chrom, start=0, end=10000000):
headers=headers,
json=payload), 'HTSGet position')

# The JSON-serialized return value cannot use None as a key, so this helper function
# recursively walks the dictionary provided and converts all keys to strings.
# NB: This overwrites any keys that were previously not strings, and can cause data loss
# if there were two keys that stringify to the same value, e.g. 12 and "12"
def fix_dicts(to_fix):
if isinstance(to_fix, dict):
new_dict = {}
for key, value in to_fix.items():
new_dict[str(key)] = fix_dicts(value)
return new_dict
elif isinstance(to_fix, list):
new_list = []
for value in to_fix:
new_list.append(fix_dicts(value))
return new_list
else:
return to_fix
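
# Illustrative behaviour of fix_dicts on hypothetical inputs: non-string keys are
# stringified recursively, so fix_dicts({12: "a", None: [{1: "b"}]}) returns
# {"12": "a", "None": [{"1": "b"}]}. Note that {12: "x", "12": "y"} collapses
# to a single "12" key, keeping only one of the two values.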

@app.route('/query')
def query(treatment="", primary_site="", chemotherapy="", immunotherapy="", hormone_therapy="", chrom="", gene="", page=0, page_size=10, assembly="hg38", session_id=""):
def query(treatment="", primary_site="", chemotherapy="", immunotherapy="", hormone_therapy="", chrom="", gene="", page=0, page_size=10, assembly="hg38", exclude_cohorts=[], session_id=""):
# NB: We're still doing table joins here, which is probably not where we want to do them
# We're grabbing (and storing in memory) all the donor data in Katsu with the below request

@@ -166,7 +197,10 @@ def query(treatment="", primary_site="", chemotherapy="", immunotherapy="", horm
r = safe_get_request_json(requests.get(f"{url}?{urllib.parse.urlencode(params)}",
# Reuse their bearer token
headers=request.headers), 'Katsu Donors')
donors = r['results']
donors = r['items']

# Filter on excluded cohorts
donors = [donor for donor in donors if donor['program_id'] not in exclude_cohorts]

# Will need to look into how to go about this -- ideally we implement this in SQL on Katsu's side
filters = [
@@ -206,6 +240,9 @@ def query(treatment="", primary_site="", chemotherapy="", immunotherapy="", horm
for response in htsget['response']:
genomic_query = response['caseLevelData']
for case_data in response['caseLevelData']:
if 'biosampleId' not in case_data:
print(f"Could not parse htsget response for {case_data}")
continue
id = case_data['biosampleId'].split('~')
if len(id) > 1:
case_data['program_id'] = id[0]
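# Hypothetical example of the ID convention assumed above: a biosampleId such as
# "SYNTHETIC-1~SAMPLE_0001" splits on '~' so that program_id becomes "SYNTHETIC-1".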
@@ -236,14 +273,16 @@ def query(treatment="", primary_site="", chemotherapy="", immunotherapy="", horm

# Determine which part of the filtered donors to send back
ret_donors = [donor['submitter_donor_id'] for donor in donors[(page*page_size):((page+1)*page_size)]]
ret_programs = [donor['program_id'] for donor in donors[(page*page_size):((page+1)*page_size)]]
full_data = {'results' : []}
if len(donors) > 0:
params = {
'page_size': PAGE_SIZE,
'donors': ','.join(ret_donors)
}
r = requests.get(f"{config.KATSU_URL}/v2/authorized/donor_with_clinical_data/?{urllib.parse.urlencode(params)}",
headers=request.headers)
full_data = safe_get_request_json(r, 'Katsu donor clinical data')
for i, donor_id in enumerate(ret_donors):
donor_id_url = urllib.parse.quote(donor_id)
program_id_url = urllib.parse.quote(ret_programs[i])
r = requests.get(f"{config.KATSU_URL}/v2/authorized/donor_with_clinical_data/program/{program_id_url}/donor/{donor_id_url}",
headers=request.headers)
full_data['results'].append(safe_get_request_json(r, 'Katsu donor clinical data'))
else:
full_data = {'results': []}
full_data['genomic'] = genomic_query
@@ -255,4 +294,36 @@ def query(treatment="", primary_site="", chemotherapy="", immunotherapy="", horm
# Add prev and next parameters to the response, appending a session ID.
# Essentially we want to go session ID -> list of donors
# and then paginate the list of donors, calling donors_with_clinical_data on each before returning
return full_data, 200
return fix_dicts(full_data), 200

@app.route('/genomic_completeness')
def genomic_completeness():
params = { 'page_size': PAGE_SIZE }
url = f"{config.KATSU_URL}/v2/authorized/sample_registrations/"
r = safe_get_request_json(requests.get(f"{url}?{urllib.parse.urlencode(params)}",
# Reuse their bearer token
headers=request.headers), 'Katsu sample registrations')
samples = r['items']

retVal = {}
for sample in samples:
program_id = sample['program_id']
if program_id not in retVal:
retVal[program_id] = { 'genomes': 0, 'transcriptomes': 0, 'all': 0 }
sample_id = sample['submitter_sample_id']

# Check with HTSGet to see whether or not this sample is complete
r = requests.get(f"{config.HTSGET_URL}/htsget/v1/samples/{sample_id}",
# Reuse their bearer token
headers=request.headers)
if r.ok:
r_json = r.json()
if len(r_json['genomes']) > 0 and len(r_json['transcriptomes']) > 0:
retVal[program_id]['all'] += 1
if len(r_json['genomes']) > 0:
retVal[program_id]['genomes'] += 1
if len(r_json['transcriptomes']) > 0:
retVal[program_id]['transcriptomes'] += 1

return retVal, 200
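
A minimal sketch of the tallying logic above, assuming HTSGet's samples endpoint returns per-sample "genomes" and "transcriptomes" lists; the payloads here are invented:

# Hypothetical HTSGet payloads for two samples of one program.
payloads = [
    {"genomes": ["sample1.vcf"], "transcriptomes": []},
    {"genomes": ["sample2.vcf"], "transcriptomes": ["sample2.tsv"]},
]
counts = {"genomes": 0, "transcriptomes": 0, "all": 0}
for p in payloads:
    if len(p["genomes"]) > 0 and len(p["transcriptomes"]) > 0:
        counts["all"] += 1
    if len(p["genomes"]) > 0:
        counts["genomes"] += 1
    if len(p["transcriptomes"]) > 0:
        counts["transcriptomes"] += 1
# counts == {"genomes": 2, "transcriptomes": 1, "all": 1}
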
2 changes: 1 addition & 1 deletion requirements.txt
@@ -2,5 +2,5 @@ Flask==2.2.2
Flask-Cors==3.0.10
pytest==7.2.0
connexion==2.14.1
uwsgi==2.0.21
uwsgi==2.0.23
swagger-ui-bundle==0.0.9