From 7f6e4333bf62323749153a1bf126872eeefa29c9 Mon Sep 17 00:00:00 2001 From: Sai Shanmukha Date: Wed, 26 Feb 2025 22:56:42 -0600 Subject: [PATCH] Temporarily adding stress-testing library. `Do not merge with master` --- stress_testing/expansion.py | 6044 ++++++++++++++++++++++++++++++++ stress_testing/sd_stress.ipynb | 287 ++ 2 files changed, 6331 insertions(+) create mode 100644 stress_testing/expansion.py create mode 100644 stress_testing/sd_stress.ipynb diff --git a/stress_testing/expansion.py b/stress_testing/expansion.py new file mode 100644 index 00000000..3ed7fb2e --- /dev/null +++ b/stress_testing/expansion.py @@ -0,0 +1,6044 @@ +import requests, json, fnmatch, os, os.path, sys, subprocess, glob, ntpath, copy, re, operator, statistics, datetime, hashlib, uuid +import pandas as pd +from os import path +from pandas import json_normalize +from collections import Counter +from statistics import mean +from io import StringIO +from IPython.utils import io +from itertools import cycle +import random +from random import randrange +from pathlib import Path + +import time +import numpy as np +import scipy +import matplotlib.pyplot as plt +import seaborn as sns + + +class Gen3Error(Exception): + pass + + +class Gen3Expansion: + """Advanced scripts for interacting with the Gen3 submission, query and index APIs + + Supports advanced data submission and exporting from Sheepdog. + Supports paginated GraphQL queries through Peregrine. + Supports Flat Model (ElasticSearch) queries through Arranger/Guppy. + Supports Indexd queries. + Supports user authentication queries. + + Args: + endpoint (str): The URL of the data commons. + auth_provider (Gen3Auth): A Gen3Auth class instance. + + Examples: + This generates the Gen3Expansion class pointed at the sandbox commons while + using the credentials.json downloaded from the commons profile page. + + >>> endpoint = "https://nci-crdc-demo.datacommons.io" + ... auth = Gen3Auth(endpoint, refresh_file="credentials.json") + ... exp = Gen3Expansion(endpoint, auth) + + """ + + def __init__(self, endpoint, auth_provider, submission): + self._auth_provider = auth_provider + self._endpoint = endpoint + self.sub = submission # submission is Gen3Submission(endpoint, auth_provider) + + def __export_file(self, filename, output): + """Writes text, e.g., an API response, to a file. + Args: + filename (str): The name of the file to be created. + output (str): The contents of the file to be created. + Example: + >>> output = requests.get(api_url, auth=self._auth_provider).text + ... 
self.__export_file(filename, output) + """ + outfile = open(filename, "w") + outfile.write(output) + outfile.close + print("Output written to file: " + filename + "\n") + + ### AWS S3 Tools: + def s3_ls(self, path, bucket, profile, pattern="*"): + """Print the results of an `aws s3 ls` command""" + s3_path = bucket + path + cmd = ["aws", "s3", "ls", s3_path, "--profile", profile] + try: + output = subprocess.check_output(cmd, stderr=subprocess.STDOUT).decode( + "UTF-8" + ) + except Exception as e: + output = e.output.decode("UTF-8") + print("ERROR:" + output) + psearch = output.split("\n") + if pattern != "*": + pmatch = fnmatch.filter( + psearch, pattern + ) # if default '*', all files will match + return arrayTable(pmatch) + else: + return output + + def s3_files(self, path, bucket, profile, pattern="*", verbose=True): + """Get a list of files returned by an `aws s3 ls` command""" + s3_path = bucket + path + cmd = ["aws", "s3", "ls", s3_path, "--profile", profile] + try: + output = subprocess.check_output( + cmd, stderr=subprocess.STDOUT, shell=True + ).decode("UTF-8") + except Exception as e: + output = e.output.decode("UTF-8") + print("ERROR:" + output) + output = [line.split() for line in output.split("\n")] + output = [ + line for line in output if len(line) == 4 + ] # filter output for lines with file info + output = [line[3] for line in output] # grab the filename only + output = fnmatch.filter(output, pattern) # if default '*', all files will match + if verbose == True: + print("\nIndex \t Filename") + for i, item in enumerate(output, start=0): + print(i, "\t", item) + return output + + def get_s3_files(self, path, bucket, profile, files=None, mydir=None): + """Transfer data from object storage to the VM in the private subnet""" + + # Set the path to the directory where files reside + s3_path = bucket + path + + # Create folder on VM for downloaded files + if not isinstance(mydir, str): + mydir = path + if not os.path.exists(mydir): + os.makedirs(mydir) + + # If files is an array of filenames, download them + if isinstance(files, list): + print("Getting files...") + for filename in files: + s3_filepath = s3_path + str(filename) + if os.path.exists(mydir + str(filename)): + print("File " + filename + " already downloaded in that location.") + else: + print(s3_filepath) + cmd = ["aws", "s3", "--profile", profile, "cp", s3_filepath, mydir] + try: + output = subprocess.check_output( + cmd, stderr=subprocess.STDOUT, shell=True + ).decode("UTF-8") + except Exception as e: + output = e.output.decode("UTF-8") + print("ERROR:" + output) + # If files == None, which syncs the s3_path 'directory' + else: + print("Syncing directory " + s3_path) + cmd = ["aws", "s3", "--profile", profile, "sync", s3_path, mydir] + try: + output = subprocess.check_output( + cmd, stderr=subprocess.STDOUT, shell=True + ).decode("UTF-8") + except Exception as e: + output = e.output.decode("UTF-8") + print("ERROR:" + output) + print("Finished") + + # Functions for downloading metadata in TSVs + + def get_project_ids(self, node=None, name=None): + """Get a list of project_ids you have access to in a data commons. + + Args: + node(str): The node you want projects to have at least one record in. + name(str): The name of the programs to get projects in, or the submitter_id of a particular record. 
+ + Example: + get_project_ids() + get_project_ids(node='demographic') + get_project_ids(node='program',name=['training','internal']) + get_project_ids(node='case',name='case-01') + """ + project_ids = [] + queries = [] + # Return all project_ids in the data commons if no node is provided or if node is program but no name provided + if name == None and ((node == None) or (node == "program")): + print( + "Getting all project_ids you have access to in {}".format( + self._endpoint + ) + ) + if node == "program": + print( + "Specify a list of program names (name = ['myprogram1','myprogram2']) to get only project_ids in particular programs." + ) + queries.append("""{project (first:0){project_id}}""") + elif name != None and node == "program": + if isinstance(name, list): + print( + "Getting all project_ids in the programs '" + ",".join(name) + "'" + ) + for program_name in name: + queries.append( + """{project (first:0, with_path_to:{type:"program",name:"%s"}){project_id}}""" + % (program_name) + ) + elif isinstance(name, str): + print("Getting all project_ids in the program '" + name + "'") + queries.append( + """{project (first:0, with_path_to:{type:"program",name:"%s"}){project_id}}""" + % (name) + ) + elif isinstance(node, str) and isinstance(name, str): + print( + "Getting all project_ids for projects with a path to record '" + + name + + "' in node '" + + node + + "'" + ) + queries.append( + """{project (first:0, with_path_to:{type:"%s",submitter_id:"%s"}){project_id}}""" + % (node, name) + ) + elif isinstance(node, str) and name == None: + print( + "Getting all project_ids for projects with at least one record in the node '" + + node + + "'" + ) + query = """{node (first:0,of_type:"%s"){project_id}}""" % (node) + df = pd.json_normalize(self.sub.query(query)["data"]["node"]) + project_ids = project_ids + list(set(df["project_id"])) + if len(queries) > 0: + for query in queries: + res = self.sub.query(query) + df = pd.json_normalize(res["data"]["project"]) + project_ids = project_ids + list(set(df["project_id"])) + my_ids = sorted(project_ids, key=str.lower) + print(my_ids) + return my_ids + + def get_node_tsvs( + self, + node, + projects=None, + overwrite=False, + remove_empty=True, + outdir="node_tsvs", + ): + """Gets a TSV of the structuerd data from particular node for each project specified. + Also creates a master TSV of merged data from each project for the specified node. + Returns a DataFrame containing the merged data for the specified node. + + Args: + node (str): The name of the node to download structured data from. + projects (list): The projects to download the node from. If "None", downloads data from each project user has access to. 
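+            overwrite (boolean): If True, re-download the TSV for a project even if a previously downloaded file exists.
+            remove_empty (boolean): If True, delete downloaded TSVs that contain no records before merging.
+            outdir (str): Parent directory for the downloaded TSVs; a '<node>_tsvs' subdirectory is created inside it.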
+ + Example: + >>> df = get_node_tsvs('demographic') + + """ + + if not os.path.exists(outdir): + os.makedirs(outdir) + mydir = "{}/{}_tsvs".format(outdir, node) + if not os.path.exists(mydir): + os.makedirs(mydir) + + if projects == None: # if no projects specified, get node for all projects + projects = list( + json_normalize( + self.sub.query("""{project (first:0){project_id}}""")["data"][ + "project" + ] + )["project_id"] + ) + elif isinstance(projects, str): + projects = [projects] + + dfs = [] + df_len = 0 + for project in projects: + filename = str(mydir + "/" + project + "_" + node + ".tsv") + if (os.path.isfile(filename)) and (overwrite == False): + print("File previously downloaded.") + else: + prog, proj = project.split("-", 1) + self.sub.export_node(prog, proj, node, "tsv", filename) + df1 = pd.read_csv(filename, sep="\t", header=0, index_col=False) + df_len += len(df1) + if not df1.empty: + dfs.append(df1) + + print(filename + " has " + str(len(df1)) + " records.") + + if remove_empty == True: + if df1.empty: + print("Removing empty file: " + filename) + cmd = ["rm", filename] # look in the download directory + try: + output = subprocess.check_output( + cmd, stderr=subprocess.STDOUT + ).decode("UTF-8") + except Exception as e: + output = e.output.decode("UTF-8") + print("ERROR deleting file: " + output) + + all_data = pd.concat(dfs, ignore_index=True, sort=False) + print("length of all dfs: " + str(df_len)) + nodefile = str("master_" + node + ".tsv") + all_data.to_csv(str(mydir + "/" + nodefile), sep="\t", index=False) + print( + "Master node TSV with " + + str(len(all_data)) + + " total records written to " + + nodefile + + "." + ) + return all_data + + def get_project_tsvs( + self, + projects=None, + nodes=None, + outdir="project_tsvs", + overwrite=False, + save_empty=False, + remove_nodes=["program", "project", "root", "data_release"], + ): + """Function gets a TSV for every node in a specified project. + Exports TSV files into a directory "project_tsvs/". + Function returns a list of the contents of the directory. + Args: + projects (str/list): The project_id(s) of the project(s) to download. Can be a single project_id or a list of project_ids. + nodes(str/list): The nodes to download from each project. If None, will try to download all nodes in the data model. + overwrite (boolean): If False, the TSV file != downloaded if there is an existing file with the same name. + save_empty(boolean): If True, TSVs with no records, i.e., downloads an empty TSV template, will be downloaded. + remove_nodes(list): A list of nodes in the data model that should not be downloaded per project. 
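+            outdir (str): Parent directory for the downloaded TSVs; a '<project_id>_tsvs' subdirectory is created per project.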
+ Example: + >>> get_project_tsvs(projects = ['internal-test']) + + """ + if nodes == None: + nodes = sorted( + list( + set( + pd.json_normalize( + self.sub.query("""{_node_type (first:-1) {id}}""")["data"][ + "_node_type" + ] + )["id"] + ) + ) + ) # get all the 'node_id's in the data model + elif isinstance(nodes, str): + nodes = [nodes] + + for node in remove_nodes: + if node in nodes: + nodes.remove(node) + + if projects == None: # if no projects specified, get node for all projects + projects = list( + pd.json_normalize( + self.sub.query("""{project (first:0){project_id}}""")["data"][ + "project" + ] + )["project_id"] + ) + elif isinstance(projects, str): + projects = [projects] + + # now = datetime.datetime.now() + # date = "{}-{}-{}-{}.{}.{}".format(now.year, now.month, now.day, now.hour, now.minute, now.second) + + for project_id in projects: + # mydir = "{}_{}/{}_tsvs".format(outdir, date, project_id) # create the directory to store TSVs + mydir = "{}/{}_tsvs".format( + outdir, project_id + ) # create the directory to store TSVs + + if not os.path.exists(mydir): + os.makedirs(mydir) + + for node in nodes: + filename = str(mydir + "/" + project_id + "_" + node + ".tsv") + if (os.path.isfile(filename)) and (overwrite == False): + print("\tPreviously downloaded: '{}'".format(filename)) + else: + query_txt = """{_%s_count (project_id:"%s")}""" % (node, project_id) + res = self.sub.query( + query_txt + ) # {'data': {'_acknowledgement_count': 0}} + count = res["data"][str("_" + node + "_count")] # count=int(0) + if count > 0 or save_empty == True: + print( + "\nDownloading {} records in node '{}' of project '{}'.".format( + count, node, project_id + ) + ) + prog, proj = project_id.split("-", 1) + self.sub.export_node(prog, proj, node, "tsv", filename) + else: + print( + "\t{} records in node '{}' of project '{}'.".format( + count, node, project_id + ) + ) + + cmd = ["ls", mydir] # look in the download directory + try: + output = subprocess.check_output(cmd, stderr=subprocess.STDOUT).decode( + "UTF-8" + ) + except Exception as e: + output = "ERROR:" + e.output.decode("UTF-8") + + return output + + # Query Functions + def query_counts( + self, + nodes, + project_id=None, + chunk_size=2500, + format="json", + args=None, + ): + """Function to paginate a query to avoid time-outs. + Returns a json of all the records in the node. + + Args: + nodes (list): The nodes to get counts for. + project_id(str): The project_id to limit the query to. Default == None. + chunk_size(int): The number of records to return per query. Default is 10000. + args(str): Put graphQL arguments here. For example, 'with_path_to:{type:"case",submitter_id:"case-01"}', etc. Don't enclose in parentheses. 
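+            Returns a dictionary mapping each requested node to its record count.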
+ Example: + exp.query_counts(project_id='Exhale-Tempus',nodes='case') + """ + + counts = {} + + if isinstance(nodes, str): + nodes = [nodes] + + for node in nodes: + if project_id != None: + program, project = project_id.split("-", 1) + if args == None: + query_txt = """{_%s_count (project_id:"%s")}""" % (node, project_id) + else: + query_txt = """{_%s_count (project_id:"%s", %s)}""" % ( + node, + project_id, + args, + ) + else: + if args == None: + query_txt = """{_%s_count}""" % (node) + else: + query_txt = """{_%s_count (%s)}""" % (node, args) + + # First query the node count to get the expected number of results for the requested query: + + try: + res = self.sub.query(query_txt) + count_name = "_".join(map(str, ["", node, "count"])) + qsize = res["data"][count_name] + counts[node] = qsize + except: + print("\n\tQuery to get _{}_count failed! {}".format(node, query_txt)) + + return counts + + # Query Functions + def paginate_query( + self, + node, + project_id=None, + props=["id", "submitter_id"], + chunk_size=2500, + format="json", + args=None, + ): + """Function to paginate a query to avoid time-outs. + Returns a json of all the records in the node. + + Args: + node (str): The node to query. + project_id (str): The project_id to limit the query to. Default == None. + props (list): A list of properties in the node to return. + chunk_size (int): The number of records to return per query. Default is 2500. + args (str): Put graphQL arguments here. For example, 'with_path_to:{type:"case",submitter_id:"case-01"}', etc. Don't enclose in parentheses. + Example: + paginate_query('demographic',format='tsv') + paginate_query('',args='with_path_to:{type:"case",submitter_id:"case-01"}') + """ + + if node == "datanode": + query_txt = """{ %s (%s) { type } }""" % (node, args) + response = self.sub.query(query_txt) + if "data" in response: + nodes = [record["type"] for record in response["data"]["datanode"]] + if len(nodes) > 1: + print( + "\tMultiple files with that file_name exist across multiple nodes:\n\t{}.".format( + nodes + ) + ) + elif len(nodes) == 1: + node = nodes[0] + else: + return nodes + + if project_id != None: + program, project = project_id.split("-", 1) + if args == None: + query_txt = """{_%s_count (project_id:"%s")}""" % (node, project_id) + else: + query_txt = """{_%s_count (project_id:"%s", %s)}""" % ( + node, + project_id, + args, + ) + else: + if args == None: + query_txt = """{_%s_count}""" % (node) + else: + query_txt = """{_%s_count (%s)}""" % (node, args) + + # First query the node count to get the expected number of results for the requested query: + + try: + res = self.sub.query(query_txt) + count_name = "_".join(map(str, ["", node, "count"])) + qsize = res["data"][count_name] + print( + "\n\tFound {} records in '{}' node of project '{}'. ".format( + qsize, node, project_id + ) + ) + except: + print("\n\tQuery to get _{}_count failed! 
{}".format(node, query_txt)) + + # Now paginate the actual query: + properties = " ".join(map(str, props)) + offset = 0 + total = {} + total["data"] = {} + total["data"][node] = [] + count = 0 + while offset < qsize: + + if project_id != None: + if args == None: + query_txt = ( + """{%s (first: %s, offset: %s, project_id:"%s"){%s}}""" + % (node, chunk_size, offset, project_id, properties) + ) + else: + query_txt = ( + """{%s (first: %s, offset: %s, project_id:"%s", %s){%s}}""" + % (node, chunk_size, offset, project_id, args, properties) + ) + else: + if args == None: + query_txt = """{%s (first: %s, offset: %s){%s}}""" % ( + node, + chunk_size, + offset, + properties, + ) + else: + query_txt = """{%s (first: %s, offset: %s, %s){%s}}""" % ( + node, + chunk_size, + offset, + args, + properties, + ) + + res = self.sub.query(query_txt) + if "data" in res: + records = res["data"][node] + + if len(records) < chunk_size: + if qsize == 999999999: + return total + + total["data"][node] += records # res['data'][node] should be a list + offset += chunk_size + elif "error" in res: + print(res["error"]) + if chunk_size > 1: + chunk_size = int(chunk_size / 2) + print("Halving chunk_size to: " + str(chunk_size) + ".") + else: + print("Query timing out with chunk_size of 1!") + exit(1) + else: + print("Query Error: " + str(res)) + + pct = int((len(total["data"][node]) / qsize) * 100) + msg = "\tRecords retrieved: {} of {} ({}%), offset: {}, chunk_size: {}.".format( + len(total["data"][node]), qsize, pct, offset, chunk_size + ) + # print(msg) + sys.stdout.write("\r" + str(msg).ljust(200, " ")) + + if format == "tsv": + df = json_normalize(total["data"][node]) + return df + else: + return total + + def paginate_query_simple(self, node, props, chunk_size=1000, offset=0): + """ + props (str; space-separated): list properties you want returned + """ + results = [] + data = range(0, 1000) + while len(data) > 0: + print("Total: {}".format(len(results))) + query_txt = """ + { + %s (first:%s, offset:%s) { + %s + } + }""" % ( + node, + chunk_size, + offset, + props, + ) + + res = self.sub.query(query_txt) + if "data" not in res or len(res["data"][node]) < 1: + print(res) + data = res["data"][node] + results = results + data + offset += chunk_size + + return results + + def paginate_query_new( + self, + node, + project_id=None, + props=[], + args=None, + chunk_size=5000, + offset=0, + format="json", + ): + """Function to paginate a query to avoid time-outs. + Returns a json of all the records in the node. + Args: + node (str): The node to query. + project_id(str): The project_id to limit the query to. Default == None. + props(list): A list of properties in the node to return. + args(str): Put graphQL arguments here. For example, 'with_path_to:{type:"case",submitter_id:"case-01"}', etc. Don't enclose in parentheses. + chunk_size(int): The number of records to return per query. Default is 10000. + offset(int): Return results with an offset; setting offset=10 will skip first 10 records. + format(str): 'json' or 'tsv'. If set to 'tsv', function will return DataFrame and create a TSV file from it. 
+ Example: + paginate_query('demographic') + """ + props = list(set(["id", "submitter_id"] + props)) + properties = " ".join(map(str, props)) + + if project_id != None: + outname = "query_{}_{}.tsv".format(project_id, node) + if args == None: + query_txt = """{%s (first: %s, offset: %s, project_id:"%s"){%s}}""" % ( + node, + chunk_size, + offset, + project_id, + properties, + ) + else: + query_txt = ( + """{%s (first: %s, offset: %s, project_id:"%s", %s){%s}}""" + % (node, chunk_size, offset, project_id, args, properties) + ) + else: + outname = "query_{}.tsv".format(node) + if args == None: + query_txt = """{%s (first: %s, offset: %s){%s}}""" % ( + node, + chunk_size, + offset, + properties, + ) + else: + query_txt = """{%s (first: %s, offset: %s, %s){%s}}""" % ( + node, + chunk_size, + offset, + args, + properties, + ) + + total = {} + total["data"] = {} + total["data"][node] = [] + + records = list(range(chunk_size)) + while len(records) == chunk_size: + + res = self.sub.query(query_txt) + + if "data" in res: + records = res["data"][node] + total["data"][node] += records # res['data'][node] should be a list + offset += chunk_size + + elif "error" in res: + print(res["error"]) + if chunk_size > 1: + chunk_size = int(chunk_size / 2) + print("\tHalving chunk_size to: {}.".format(chunk_size)) + else: + print("\tQuery timing out with chunk_size of 1!") + exit(1) + + else: + print("Query Error: {}".format(res)) + + print("\tTotal records retrieved: {}".format(len(total["data"][node]))) + + if format == "tsv": + df = json_normalize(total["data"][node]) + df.to_csv(outname, sep="\t", index=False) + return df + else: + return total + + def get_uuids_in_node(self, node, project_id): + """ + This function returns a list of all the UUIDs of records + in a particular node of a given project. + """ + program, project = project_id.split("-", 1) + + try: + res = self.paginate_query(node, project_id) + uuids = [x["id"] for x in res["data"][node]] + except: + raise Gen3Error( + "Failed to get UUIDs in node '" + + node + + "' of project '" + + project_id + + "'." + ) + + return uuids + + def list_project_files(self, project_id): + query_txt = ( + """{datanode(first:-1,project_id: "%s") {type file_name id object_id}}""" + % (project_id) + ) + res = self.sub.query(query_txt) + if len(res["data"]["datanode"]) == 0: + print("Project " + project_id + " has no records in any data_file node.") + return None + else: + df = json_normalize(res["data"]["datanode"]) + json_normalize(Counter(df["type"])) + # guids = df.loc[(df['type'] == node)]['object_id'] + return df + + def get_uuids_for_submitter_ids(self, sids, node): + """ + Get a list of UUIDs for a provided list of submitter_ids. + """ + uuids = [] + count = 0 + for sid in sids: + count += 1 + args = 'submitter_id:"{}"'.format(sid) + res = self.paginate_query(node=node, args=args) + recs = res["data"][node] + if len(recs) == 1: + uuids.append(recs[0]["id"]) + elif len(recs) == 0: + print("No data returned for {}:\n\t{}".format(sid, res)) + print("\t{}/{}".format(count, len(sids))) + print( + "Finished retrieving {} uuids for {} submitter_ids".format( + len(uuids), len(sids) + ) + ) + return uuids + + def get_records_for_submitter_ids(self, sids, node): + """ + Get a list of UUIDs for a provided list of submitter_ids. 
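+        Each record found is also exported to a per-project TSV under 'project_uuids/', and a merged DataFrame of all records is returned.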
+ # could also use:{node(submitter_id: "xyz") {id project_id}} # + """ + uuids = [] + pids = [] + count = 0 + for sid in sids: + count += 1 + args = 'submitter_id:"{}"'.format(sid) + res = self.paginate_query( + node=node, args=args, props=["id", "submitter_id", "project_id"] + ) + recs = res["data"][node] + if len(recs) == 1: + uuids.append(recs[0]["id"]) + pids.append(recs[0]["project_id"]) + elif len(recs) == 0: + print("No data returned for {}:\n\t{}".format(sid, res)) + print("\t{}/{}".format(count, len(sids))) + print( + "Finished retrieving {} uuids for {} submitter_ids".format( + len(uuids), len(sids) + ) + ) + df = pd.DataFrame({"project_id": pids, "uuid": uuids, "submitter_id": sids}) + + dfs = [] + for i in range(len(df)): + sid = df.iloc[i]["submitter_id"] + pid = df.iloc[i]["project_id"] + uuid = df.iloc[i]["uuid"] + prog, proj = pid.split("-", 1) + print("({}/{}): {}".format(i + 1, len(df), uuid)) + mydir = "project_uuids/{}_tsvs".format( + pid + ) # create the directory to store TSVs + if not os.path.exists(mydir): + os.makedirs(mydir) + filename = "{}/{}_{}.tsv".format(mydir, pid, uuid) + if os.path.isfile(filename): + print("File previously downloaded.") + else: + self.sub.export_record(prog, proj, uuid, "tsv", filename) + df1 = pd.read_csv(filename, sep="\t", header=0) + dfs.append(df1) + all_data = pd.concat(dfs, ignore_index=True) + master = "master_uuids_{}.tsv".format(node) + all_data.to_csv("{}".format(master), sep="\t", index=False) + print( + "Master node TSV with {} total recs written to {}.".format( + len(all_data), master + ) + ) + return all_data + + def delete_records(self, uuids, project_id, chunk_size=200, backup=False): + """ + This function attempts to delete a list of UUIDs from a project. + It returns a dictionary with a list of successfully deleted UUIDs, + a list of those that failed, all the API responses, and all the error messages. + + Args: + uuids(list): A list of the UUIDs to delete. + project_id(str): The project to delete the IDs from. + chunk_size(int): The number of records to delete in each API request. + backup(str): If provided, deleted records are backed up to this filename. + Example: + delete_records(project_id=project_id,uuids=uuids,chunk_size=200) + """ + program, project = project_id.split("-", 1) + + if isinstance(uuids, str): + uuids = [uuids] + + if not isinstance(uuids, list): + raise Gen3Error( + "Please provide a list of UUID(s) to delete with the 'uuid' argument." 
+ ) + + if backup: + ext = backup.split(".")[-1] + fname = ".".join(backup.split(".")[0:-1]) + count = 0 + while path.exists(backup): + count += 1 + backup = "{}_{}.{}".format(fname, count, ext) + + count = 0 + print( + "Attempting to backup {} records to delete to file '{}'.".format( + len(uuids), backup + ) + ) + + records = [] + for uuid in uuids: + count += 1 + try: + response = self.sub.export_record( + program=program, + project=project, + uuid=uuid, + fileformat="json", + filename=None, + ) + record = json.loads(json.dumps(response[0])) + records.append(record) + print( + "\tRetrieving record for UUID '{}' ({}/{}).".format( + uuid, count, len(uuids) + ) + ) + except Exception as e: + print( + "Exception occurred during 'export_record' request: {}.".format( + e + ) + ) + continue + + with open(backup, "w") as backfile: + backfile.write("{}".format(records)) + + responses = [] + errors = [] + failure = [] + success = [] + retry = [] + tried = [] + results = {} + + while len(tried) < len(uuids): # loop sorts all uuids into success or failure + + if len(retry) > 0: + print("Retrying deletion of {} valid UUIDs.".format(len(retry))) + list_ids = ",".join(retry) + retry = [] + else: + list_ids = ",".join(uuids[len(tried) : len(tried) + chunk_size]) + + rurl = "{}/api/v0/submission/{}/{}/entities/{}".format( + self._endpoint, program, project, list_ids + ) + + try: + # print("\n\trurl='{}'\n".format(rurl)) # trouble-shooting + # print("\n\tresp = requests.delete(rurl, auth=auth)") + # print("\n\tprint(resp.text)") + resp = requests.delete(rurl, auth=self._auth_provider) + + except Exception as e: + chunk_size = int(chunk_size / 2) + print( + "Exception occurred during delete request:\n\t{}.\n\tReducing chunk_size to '{}'.".format( + e, chunk_size + ) + ) + continue + + if ( + "414 Request-URI Too Large" in resp.text + or "service failure" in resp.text + ): + chunk_size = int(chunk_size / 2) + print( + "Service Failure. The chunk_size is too large. Reducing to '{}'".format( + chunk_size + ) + ) + + elif "The requested URL was not found on the server." in resp.text: + print( + "\n Requested URL not found on server:\n\t{}\n\t{}".format( + resp, rurl + ) + ) # debug + break + else: # the delete request got an API response + # print(resp.text) #trouble-shooting + output = json.loads(resp.text) + responses.append(output) + + if output["success"]: # 'success' == True or False in API response + success = list(set(success + [x["id"] for x in output["entities"]])) + else: # if one UUID fails to delete in the request, the entire request fails. + for entity in output["entities"]: + if entity[ + "valid" + ]: # get the valid entities from repsonse to retry. 
+ retry.append(entity["id"]) + else: + errors.append(entity["errors"][0]["message"]) + failure.append(entity["id"]) + failure = list(set(failure)) + for error in list(set(errors)): + print( + "Error message for {} records: {}".format( + errors.count(error), error + ) + ) + + tried = list(set(success + failure)) + print( + "\tProgress: {}/{} (Success: {}, Failure: {}).".format( + len(tried), len(uuids), len(success), len(failure) + ) + ) + + # exit the while loop if + results["failure"] = failure + results["success"] = success + results["responses"] = responses + results["errors"] = errors + print("\tFinished record deletion script.") + if len(success) > 0: + print("Successfully deleted {} records.".format(len(success))) + self.nuked() + return results + + def delete_node(self, node, project_id, chunk_size=200): + """ + This function attempts to delete all the records in a particular node of a project. + It returns the results of the delete_records function. + """ + try: + uuids = self.get_uuids_in_node(node, project_id) + except: + raise Gen3Error( + "Failed to get UUIDs in the node '" + + node + + "' of project '" + + project_id + + "'." + ) + + if len(uuids) != 0: + print( + "Attemping to delete {} records in the node '{}' of project '{}'.".format( + len(uuids), node, project_id + ) + ) + + try: + results = self.delete_records(uuids, project_id, chunk_size) + print( + "Successfully deleted {} records in the node '{}' of project '{}'.".format( + len(results["success"]), node, project_id + ) + ) + + if len(results["failure"]) > 0: + print( + "Failed to delete {} records. See results['errors'] for the error messages.".format( + len(results["failure"]) + ) + ) + + except: + raise Gen3Error( + "Failed to delete UUIDs in the node '{}' of project '{}'.".format( + node, project_id + ) + ) + + return results + + def get_submission_order( + self, + root_node="project", + excluded_schemas=[ + "_definitions", + "_settings", + "_terms", + "program", + "project", + "root", + "data_release", + "metaschema", + ], + ): + """ + This function gets a data dictionary, and then it determines the submission order of nodes by looking at the links. + The reverse of this is the deletion order for deleting projects. (Must delete child nodes before parents). 
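+
+        Example (illustrative; actual node names and ordering depend on the commons' dictionary):
+            >>> exp.get_submission_order()
+            [('project', 0), ('case', 1), ('demographic', 2), ('sample', 2), ...]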
+ """ + dd = self.sub.get_dictionary_all() + schemas = list(dd) + nodes = [k for k in schemas if k not in excluded_schemas] + submission_order = [ + (root_node, 0) + ] # make a list of tuples with (node, order) where order is int + while ( + len(submission_order) < len(nodes) + 1 + ): # "root_node" != in "nodes", thus the +1 + for node in nodes: + if ( + len([item for item in submission_order if node in item]) == 0 + ): # if the node != in submission_order + # print("Node: {}".format(node)) + node_links = dd[node]["links"] + parents = [] + for link in node_links: + if "target_type" in link: # node = 'webster_step_second_test' + parents.append(link["target_type"]) + elif "subgroup" in link: # node = 'expression_array_result' + sub_links = link.get("subgroup") + if not isinstance(sub_links, list): + sub_links = [sub_links] + for sub_link in sub_links: + if "target_type" in sub_link: + parents.append(sub_link["target_type"]) + if False in [ + i in [i[0] for i in submission_order] for i in parents + ]: + continue # if any parent != already in submission_order, skip this node for now + else: # submit this node after the last parent to submit + parents_order = [ + item for item in submission_order if item[0] in parents + ] + submission_order.append( + (node, max([item[1] for item in parents_order]) + 1) + ) + submission_order = sorted(submission_order, key=lambda x: x[1]) + return submission_order + + def delete_project( + self, project_id, root_node="project", chunk_size=200, nuke_project=False + ): + prog, proj = project_id.split("-", 1) + submission_order = self.get_submission_order(root_node=root_node) + delete_order = sorted(submission_order, key=lambda x: x[1], reverse=True) + nodes = [i[0] for i in delete_order] + try: + nodes.remove("project") + except: + print("\n\nNo 'project' node in list of nodes.") + for node in nodes: + print("\n\tDeleting node '{}' from project '{}'.".format(node, project_id)) + # data = self.delete_node( + # node=node, project_id=project_id, chunk_size=chunk_size + # ) + self.sub.delete_node(program=prog, project=proj, node_name=node) + if nuke_project is True: + try: + data = self.sub.delete_project(program=prog, project=proj) + except Exception as e: + print("Couldn't delete project '{}':\n\t{}".format(project_id, e)) + if "Can not delete the project." 
in data: + print("{}".format(data)) + else: + print("Successfully deleted the project '{}'".format(project_id)) + self.nuked() + else: + self.nuked() + print( + "\n\nSuccessfully deleted all nodes in the project '{}'.\nIf you'd like to delete the project node itself, then add the flag 'nuke_project=True'.".format( + project_id + ) + ) + + # Analysis Functions + def property_counts_table(self, prop, df): + df = df[df[prop].notnull()] + counts = Counter(df[prop]) + df1 = pd.DataFrame.from_dict(counts, orient="index").reset_index() + df1 = df1.rename(columns={"index": prop, 0: "count"}).sort_values( + by="count", ascending=False + ) + total = sum(df1["count"]) + df1["percent"] = round(100 * (df1["count"] / total), 1) + + with pd.option_context("display.max_rows", None, "display.max_columns", None): + print(df1.to_string(index=False)) + print("\nTotal Count: {}, Total Categories: {}".format(total, len(df1))) + + return df1 + + def property_counts_by_project(self, prop, df): + df = df[df[prop].notnull()] + categories = list(set(df[prop])) + projects = list(set(df["project_id"])) + + project_table = pd.DataFrame(columns=["Project", "Total"] + categories) + project_table + + proj_counts = {} + for project in projects: + cat_counts = {} + cat_counts["Project"] = project + df1 = df.loc[df["project_id"] == project] + total = 0 + for category in categories: + cat_count = len(df1.loc[df1[prop] == category]) + total += cat_count + cat_counts[category] = cat_count + + cat_counts["Total"] = total + index = len(project_table) + for key in list(cat_counts.keys()): + project_table.loc[index, key] = cat_counts[key] + + project_table = project_table.sort_values( + by="Total", ascending=False, na_position="first" + ) + + return project_table + + def plot_categorical_property(self, property, df): + # plot a bar graph of categorical variable counts in a dataframe + df = df[df[property].notnull()] + N = len(df) + categories, counts = zip(*Counter(df[property]).items()) + y_pos = np.arange(len(categories)) + plt.bar(y_pos, counts, align="center", alpha=0.5) + # plt.figtext(.8, .8, 'N = '+str(N)) + plt.xticks(y_pos, categories) + plt.ylabel("Counts") + plt.title(str("Counts by " + property + " (N = " + str(N) + ")")) + plt.xticks(rotation=90, horizontalalignment="center") + # add N for each bar + plt.show() + + def plot_numeric_property(self, property, df, by_project=False): + # plot a histogram of numeric variable in a dataframe + df = df[df[property].notnull()] + data = list(df[property].astype(float)) + N = len(data) + fig = sns.distplot( + data, + hist=False, + kde=True, + bins=int(180 / 5), + color="darkblue", + kde_kws={"linewidth": 2}, + ) + # plt.figtext(.8, .8, 'N = '+str(N)) + plt.xlabel(property) + plt.ylabel("Probability") + plt.title( + "PDF for all projects " + property + " (N = " + str(N) + ")" + ) # You can comment this line out if you don't need title + plt.show(fig) + + if by_project == True: + projects = list(set(df["project_id"])) + for project in projects: + proj_df = df[df["project_id"] == project] + data = list(proj_df[property].astype(float)) + N = len(data) + fig = sns.distplot( + data, + hist=False, + kde=True, + bins=int(180 / 5), + color="darkblue", + kde_kws={"linewidth": 2}, + ) + # plt.figtext(.8, .8, 'N = '+str(N)) + plt.xlabel(property) + plt.ylabel("Probability") + plt.title( + "PDF for " + property + " in " + project + " (N = " + str(N) + ")" + ) # You can comment this line out if you don't need title + plt.show(fig) + + def plot_numeric_property_by_category( + self, 
numeric_property, category_property, df + ): + # plot a histogram of numeric variable in a dataframe + df = df[df[numeric_property].notnull()] + data = list(df[numeric_property]) + N = len(data) + + categories = list(set(df[category_property])) + for category in categories: + df_2 = df[df[category_property] == category] + if len(df_2) != 0: + data = list(df_2[numeric_property].astype(float)) + N = len(data) + fig = sns.distplot( + data, + hist=False, + kde=True, + bins=int(180 / 5), + color="darkblue", + kde_kws={"linewidth": 2}, + ) + # plt.figtext(.8, .8, 'N = '+str(N)) + plt.xlabel(numeric_property) + plt.ylabel("Probability") + plt.title( + "PDF of " + + numeric_property + + " for " + + category + + " (N = " + + str(N) + + ")" + ) # You can comment this line out if you don't need title + plt.show(fig) + + def plot_numeric_by_category(self, numeric_property, category_property, df): + sns.set(style="darkgrid") + categories = list(set(df[category_property])) + + N = 0 + for category in categories: + subset = df[df[category_property] == category] + N += len(subset) + data = subset[numeric_property].dropna().astype(float) + fig = sns.distplot( + data, + hist=False, + kde=True, + bins=3, + kde_kws={"linewidth": 2}, + label=category, + ) + + plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0) + + plt.title( + numeric_property + " by " + category_property + " (N = " + str(N) + ")" + ) # You can comment this line out if you don't need title + plt.show(fig) + + def plot_category_by_category(self, prop1, prop2, df): + sns.set(style="darkgrid") + categories, counts = zip(*Counter(df[prop1]).items()) + N = 0 + + for category in categories: + subset = df[df[prop1] == category] + N += len(subset) + data = subset[prop2].dropna().astype(str) + + y_pos = np.arange(len(categories)) + plt.bar(y_pos, counts, align="center", alpha=0.5) + plt.xticks(y_pos, categories) + plt.ylabel("Counts") + plt.xticks(rotation=90, horizontalalignment="center") + + plt.title("{} by {} (N = {})".format(prop1, prop2, N)) + plt.show() + + def plot_top10_numeric_by_category(self, numeric_property, category_property, df): + sns.set(style="darkgrid") + categories = list(set(df[category_property])) + + category_means = {} + for category in categories: + df_2 = df[df[numeric_property].notnull()] + data = list( + df_2.loc[df_2[category_property] == category][numeric_property].astype( + float + ) + ) + + if len(data) > 5: + category_means[category] = mean(data) + + if len(category_means) > 1: + sorted_means = sorted( + category_means.items(), key=operator.itemgetter(1), reverse=True + )[0:10] + categories_list = [x[0] for x in sorted_means] + + N = 0 + for category in categories_list: + subset = df[df[category_property] == category] + N += len(subset) + data = subset[numeric_property].dropna().astype(float) + fig = sns.distplot( + data, + hist=False, + kde=True, + bins=3, + kde_kws={"linewidth": 2}, + label=category, + ) + + plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0) + + plt.title( + numeric_property + " by " + category_property + " (N = " + str(N) + ")" + ) + plt.show(fig) + + def plot_numeric_property_by_2_categories( + self, numeric_property, category_property, category_property_2, df + ): + + df = df[df[numeric_property].notnull()] + data = list(df[numeric_property]) + N = len(data) + categories = list(set(df[category_property])) + + for category in categories: + df_2 = df[df[category_property] == category] + categories_2 = list( + set(df_2[category_property_2]) + ) # This is a list of all compounds 
tested for each tissue type. + + N = 0 + for category_2 in categories_2: + subset = df_2[df_2[category_property_2] == category_2] + N += len(subset) + data = subset[numeric_property].dropna().astype(float) + fig = sns.distplot( + data, + hist=False, + kde=True, + bins=3, + kde_kws={"linewidth": 2}, + label=category_2, + ) + + plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0) + + plt.title( + numeric_property + " for " + category + " (N = " + str(N) + ")" + ) # You can comment this line out if you don't need title + plt.show(fig) + + def plot_top10_numeric_property_by_2_categories( + self, numeric_property, category_property, category_property_2, df + ): + df = df[df[numeric_property].notnull()] + categories = list(set(df[category_property])) + + for category in categories: + df_2 = df[df[category_property] == category] + categories_2 = list( + set(df_2[category_property_2]) + ) # This is a list of all category_property_2 values for each category_property value. + + category_2_means = {} + for category_2 in categories_2: + df_3 = df_2[df_2[numeric_property].notnull()] + data = list( + df_3.loc[df_3[category_property_2] == category_2][ + numeric_property + ].astype(float) + ) + + if len(data) > 5: + category_2_means[category_2] = mean(data) + + if len(category_2_means) > 1: + sorted_means = sorted( + category_2_means.items(), key=operator.itemgetter(1), reverse=True + )[0:10] + categories_2_list = [x[0] for x in sorted_means] + + N = 0 + for category_2 in categories_2_list: + subset = df_2[df_2[category_property_2] == category_2] + N += len(subset) + data = subset[numeric_property].dropna().astype(float) + fig = sns.distplot( + data, + hist=False, + kde=True, + bins=3, + kde_kws={"linewidth": 2}, + label=category_2, + ) + + plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0) + + plt.title( + numeric_property + " for " + category + " (N = " + str(N) + ")" + ) # You can comment this line out if you don't need title + plt.show(fig) + + def node_record_counts(self, project_id): + query_txt = """{node (first:-1, project_id:"%s"){type}}""" % (project_id) + res = self.sub.query(query_txt) + df = json_normalize(res["data"]["node"]) + counts = Counter(df["type"]) + df = pd.DataFrame.from_dict(counts, orient="index").reset_index() + df = df.rename(columns={"index": "node", 0: "count"}) + return df + + def get_data_file_tsvs(self, projects=None, remove_empty=True): + # Download TSVs for all data file nodes in the specified projects + # if no projects specified, get node for all projects + if projects == None: + projects = list( + json_normalize( + self.sub.query("""{project (first:0){project_id}}""")["data"][ + "project" + ] + )["project_id"] + ) + elif isinstance(projects, str): + projects = [projects] + # Make a directory for files + mydir = "downloaded_data_file_tsvs" + if not os.path.exists(mydir): + os.makedirs(mydir) + # list all data_file 'node_id's in the data model + dnodes = list( + set( + json_normalize( + self.sub.query( + """{_node_type (first:-1,category:"data_file") {id}}""" + )["data"]["_node_type"] + )["id"] + ) + ) + mnodes = list( + set( + json_normalize( + self.sub.query( + """{_node_type (first:-1,category:"metadata_file") {id}}""" + )["data"]["_node_type"] + )["id"] + ) + ) + inodes = list( + set( + json_normalize( + self.sub.query( + """{_node_type (first:-1,category:"index_file") {id}}""" + )["data"]["_node_type"] + )["id"] + ) + ) + nodes = list(set(dnodes + mnodes + inodes)) + # get TSVs and return a master pandas DataFrame with records from every 
project + dfs = [] + df_len = 0 + for node in nodes: + for project in projects: + filename = str(mydir + "/" + project + "_" + node + ".tsv") + if os.path.isfile(filename): + print("\n" + filename + " previously downloaded.") + else: + prog, proj = project.split("-", 1) + self.sub.export_node( + prog, proj, node, "tsv", filename + ) # use the gen3sdk to download a tsv for the node + df1 = pd.read_csv( + filename, sep="\t", header=0 + ) # read in the downloaded TSV to append to the master (all projects) TSV + dfs.append(df1) + df_len += len(df1) # Counting the total number of records in the node + print(filename + " has " + str(len(df1)) + " records.") + if remove_empty == True: + if df1.empty: + print("Removing empty file: " + filename) + cmd = ["rm", filename] # look in the download directory + try: + output = subprocess.check_output( + cmd, stderr=subprocess.STDOUT + ).decode("UTF-8") + except Exception as e: + output = e.output.decode("UTF-8") + print("ERROR:" + output) + all_data = pd.concat(dfs, ignore_index=True, sort=False) + print( + "\nlength of all dfs: " + str(df_len) + ) # this should match len(all_data) below + nodefile = str("master_" + node + ".tsv") + all_data.to_csv(str(mydir + "/" + nodefile), sep="\t") + print( + "Master node TSV with " + + str(len(all_data)) + + " total records written to " + + nodefile + + "." + ) # this should match df_len above + return all_data + + def list_guids_in_nodes(self, nodes=None, projects=None): + # Get GUIDs for node(s) in project(s) + if ( + nodes == None + ): # get all data_file/metadata_file/index_file 'node_id's in the data model + categories = ["data_file", "metadata_file", "index_file"] + nodes = [] + for category in categories: + query_txt = """{_node_type (first:-1,category:"%s") {id}}""" % category + df = json_normalize(self.sub.query(query_txt)["data"]["_node_type"]) + if not df.empty: + nodes = list(set(nodes + list(set(df["id"])))) + elif isinstance(nodes, str): + nodes = [nodes] + if projects == None: + projects = list( + json_normalize( + self.sub.query("""{project (first:0){project_id}}""")["data"][ + "project" + ] + )["project_id"] + ) + elif isinstance(projects, str): + projects = [projects] + all_guids = ( + {} + ) # all_guids will be a nested dict: {project_id: {node1:[guids1],node2:[guids2]} } + for project in projects: + all_guids[project] = {} + for node in nodes: + guids = [] + query_txt = ( + """{%s (first:-1,project_id:"%s") {project_id file_size file_name object_id id}}""" + % (node, project) + ) + res = self.sub.query(query_txt) + if len(res["data"][node]) == 0: + print(project + " has no records in node " + node + ".") + guids = None + else: + df = json_normalize(res["data"][node]) + guids = list(df["object_id"]) + print( + project + + " has " + + str(len(guids)) + + " records in node " + + node + + "." 
+ ) + all_guids[project][node] = guids + # nested dict: all_guids[project][node] + return all_guids + + def get_access_token(self): + """get your temporary access token using your credentials downloaded from the data portal + variable <- jsonlite::toJSON(list(api_key = keys$api_key), auto_unbox = TRUE) + auth <- POST('https://data.braincommons.org/user/credentials/cdis/access_token', add_headers("Content-Type" = "application/json"), body = variable) + + """ + access_token = self._auth_provider._get_auth_value() + return access_token + + def download_file_endpoint(self, guid=None): + """download files by getting a presigned-url from the "/user/data/download/" endpoint""" + if not isinstance(guid, str): + raise Gen3Error("Please, supply GUID as string.") + + download_url = "{}/user/data/download/{}".format(self._endpoint, guid) + print("Downloading file from '{}'.".format(download_url)) + + try: + # get the pre-signed URL + res = requests.get( + download_url, auth=self._auth_provider + ) # get the presigned URL + file_url = json.loads(res.content)["url"] + + # extract the filename from the pre-signed url + f_regex = re.compile( + r".*[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}\/(.*)\?.*" + ) + fmatch = f_regex.match(res.text) + if fmatch: + file_name = fmatch.groups()[0] + print("\tSaving downloaded file as '{}'".format(file_name)) + else: + file_name = guid + print( + "No matching filename in the response. Saving file with GUID as filename." + ) + + # get the file and write the contents to the file_name + res_file = requests.get(file_url) + open("./{}".format(file_name), "wb").write(res_file.content) + + except Exception as e: + print("\tFile '{}' failed to download: {}".format(file_name, e)) + + return file_name + + def download_files_for_guids( + self, + guids=None, + profile="profile", + client="/home/jovyan/.gen3/gen3-client", + method="endpoint", + ): + # Make a directory for files + mydir = "downloaded_data_files" + file_names = {} + if not os.path.exists(mydir): + os.makedirs(mydir) + if isinstance(guids, str): + guids = [guids] + if isinstance(guids, list): + for guid in guids: + if method == "client": + cmd = ( + client + + " download-single --filename-format=combined --no-prompt --profile=" + + profile + + " --guid=" + + guid + ) + try: + output = subprocess.check_output( + cmd, stderr=subprocess.STDOUT, shell=True + ).decode("UTF-8") + try: + file_name = re.search( + "Successfully downloaded (.+)\\n", output + ).group(1) + cmd = "mv " + file_name + " " + mydir + try: + output = subprocess.check_output( + cmd, stderr=subprocess.STDOUT, shell=True + ).decode("UTF-8") + except Exception as e: + output = e.output.decode("UTF-8") + print("ERROR:" + output) + except AttributeError: + file_name = "" # apply your error handling + print("Successfully downloaded: " + file_name) + file_names[guid] = file_name + except Exception as e: + output = e.output.decode("UTF-8") + print("ERROR:" + output) + elif method == "endpoint": + try: + file_name = self.download_file_endpoint(guid=guid) + file_names[guid] = file_name + except Exception as e: + print("Failed to download GUID {}: {}".format(guid, e)) + else: + print( + "\tPlease set method to either 'endpoint' or 'client'!".format() + ) + else: + print( + 'Provide a list of guids to download: "get_file_by_guid(guids=guid_list)"' + ) + return file_names + + # file_name = 'GSE63878_final_list_of_normalized_data.txt.gz' + # exp.download_file_name(file_name) + + def download_file_name( + self, + file_name, + 
node="datanode", + project_id=None, + props=[ + "type", + "file_name", + "object_id", + "id", + "submitter_id", + "data_type", + "data_format", + "data_category", + ], + all=False, + ): + """downloads the first file that matches a query for a file_name in a node of a project""" + args = 'file_name:"{}"'.format(file_name) + response = self.paginate_query( + node=node, project_id=project_id, props=props, args=args + ) # Use the SDK to send the query and return the response + + if "data" in response: + node = list(response["data"])[0] + records = response["data"][node] + + if len(records) > 1 and all == False: + print( + "\tWARNING - More than one record matched query for '{}' in '{}' node of project '{}'.".format( + file_name, node, project + ) + ) + print( + "\t\tDownloading the first file that matched the query:\n{}".format( + data[0] + ) + ) + + if len(records) >= 1 and all == False: + record = records[0] + guid = record["object_id"] + fname = self.download_file_endpoint(guid=guid) + + elif all == True: + guids = [record["object_id"] for record in records] + for guid in guids: + self.download_file_endpoint(guid=guid) + + return records + + else: + print( + "There were no records in the query for '{}' in the '{}' node of project_id '{}'".format( + file_name, node, project_id + ) + ) + return response + + def get_records_for_uuids(self, uuids, project, api): + dfs = [] + for uuid in uuids: + # Gen3Submission.export_record("DCF", "CCLE", "d70b41b9-6f90-4714-8420-e043ab8b77b9", "json", filename="DCF-CCLE_one_record.json") + # export_record(self, program, project, uuid, fileformat, filename=None) + mydir = str( + "project_uuids/" + project + "_tsvs" + ) # create the directory to store TSVs + if not os.path.exists(mydir): + os.makedirs(mydir) + filename = str(mydir + "/" + project + "_" + uuid + ".tsv") + if os.path.isfile(filename): + print("File previously downloaded.") + else: + prog, proj = project.split("-", 1) + self.sub.export_record(prog, proj, uuid, "tsv", filename) + df1 = pd.read_csv(filename, sep="\t", header=0) + dfs.append(df1) + all_data = pd.concat(dfs, ignore_index=True) + master = str("master_uuids_" + project + ".tsv") + all_data.to_csv(str(mydir + "/" + master), sep="\t") + print( + "Master node TSV with " + + str(len(all_data)) + + " total records written to " + + master + + "." 
+ ) + return all_data + + def find_duplicate_filenames(self, node, project): + # download the node + df = get_node_tsvs(node, project, overwrite=True) + counts = Counter(df["file_name"]) + count_df = pd.DataFrame.from_dict(counts, orient="index").reset_index() + count_df = count_df.rename(columns={"index": "file_name", 0: "count"}) + dup_df = count_df.loc[count_df["count"] > 1] + dup_files = list(dup_df["file_name"]) + dups = df[df["file_name"].isin(dup_files)].sort_values( + by="md5sum", ascending=False + ) + return dups + + def get_duplicates(self, nodes, projects, api): + # Get duplicate SUBMITTER_IDs in a node, which SHOULD NEVER HAPPEN but alas it has, thus this script + # if no projects specified, get node for all projects + if projects == None: + projects = list( + json_normalize( + self.sub.query("""{project (first:0){project_id}}""")["data"][ + "project" + ] + )["project_id"] + ) + elif isinstance(projects, str): + projects = [projects] + + # if no nodes specified, get all nodes in data commons + if nodes == None: + nodes = sorted( + list( + set( + json_normalize( + self.sub.query("""{_node_type (first:-1) {id}}""")["data"][ + "_node_type" + ] + )["id"] + ) + ) + ) # get all the 'node_id's in the data model + remove_nodes = [ + "program", + "project", + "root", + "data_release", + ] # remove these nodes from list of nodes + for node in remove_nodes: + if node in nodes: + nodes.remove(node) + elif isinstance(nodes, str): + nodes = [nodes] + + pdups = {} + for project_id in projects: + pdups[project_id] = {} + print("Getting duplicates in project " + project_id) + for node in nodes: + print("\tChecking " + node + " node") + df = paginate_query( + node=node, + project_id=project_id, + props=["id", "submitter_id"], + chunk_size=1000, + ) + if not df.empty: + counts = Counter(df["submitter_id"]) + c = pd.DataFrame.from_dict(counts, orient="index").reset_index() + c = c.rename(columns={"index": "submitter_id", 0: "count"}) + dupc = c.loc[c["count"] > 1] + if not dupc.empty: + dups = list(set(dupc["submitter_id"])) + uuids = {} + for sid in dups: + uuids[sid] = list(df.loc[df["submitter_id"] == sid]["id"]) + pdups[project_id][node] = uuids + return pdups + + def delete_duplicates(self, dups, project_id, api): + + if not isinstance(dups, dict): + print( + "Must provide duplicates as a dictionary of keys:submitter_ids and values:uuids; use get_duplicates function" + ) + + program, project = project_id.split("-", 1) + failure = [] + success = [] + results = {} + sids = list(dups.keys()) + total = len(sids) + count = 1 + for sid in sids: + while len(dups[sid]) > 1: + uuid = dups[sid].pop(1) + r = json.loads(self.sub.delete_record(program, project, uuid)) + if r["code"] == 200: + print( + "Deleted record id (" + + str(count) + + "/" + + str(total) + + "): " + + uuid + ) + success.append(uuid) + else: + print("Could not deleted record id: " + uuid) + print("API Response: " + r["code"]) + failure.append(uuid) + results["failure"] = failure + results["success"] = success + count += 1 + return results + + def query_records(self, node, project_id, api, chunk_size=500): + # Using paginated query, Download all data in a node as a DataFrame and save as TSV + schema = self.sub.get_dictionary_node(node) + props = list(schema["properties"].keys()) + links = list(schema["links"]) + # need to get links out of the list of properties because they're handled differently in the query + link_names = [] + for link in links: + link_list = list(link) + if "subgroup" in link_list: + subgroup = link["subgroup"] + 
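+                # a "subgroup" link bundles several parent links; collect the name of each sublink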
for sublink in subgroup: + link_names.append(sublink["name"]) + else: + link_names.append(link["name"]) + for link in link_names: + if link in props: + props.remove(link) + props.append(str(link + "{id submitter_id}")) + + df = paginate_query(node, project_id, props, chunk_size) + outfile = "_".join(project_id, node, "query.tsv") + df.to_csv(outfile, sep="\t", index=False, encoding="utf-8") + return df + + # Group entities in details into succeeded (successfully created/updated) and failed valid/invalid + def summarize_submission(self, tsv, details, write_tsvs): + with open(details, "r") as file: + f = file.read().rstrip("\n") + chunks = f.split("\n\n") + invalid = [] + messages = [] + valid = [] + succeeded = [] + responses = [] + results = {} + chunk_count = 1 + for chunk in chunks: + d = json.loads(chunk) + if "code" in d and d["code"] != 200: + entities = d["entities"] + response = str( + "Chunk " + + str(chunk_count) + + " Failed: " + + str(len(entities)) + + " entities." + ) + responses.append(response) + for entity in entities: + sid = entity["unique_keys"][0]["submitter_id"] + if entity["valid"]: # valid but failed + valid.append(sid) + else: # invalid and failed + message = entity["errors"][0]["message"] + messages.append(message) + invalid.append(sid) + print("Invalid record: {}\n\tmessage: {}".format(sid, message)) + elif "code" not in d: + responses.append("Chunk " + str(chunk_count) + " Timed-Out: " + str(d)) + else: + entities = d["entities"] + response = str( + "Chunk " + + str(chunk_count) + + " Succeeded: " + + str(len(entities)) + + " entities." + ) + responses.append(response) + for entity in entities: + sid = entity["unique_keys"][0]["submitter_id"] + succeeded.append(sid) + chunk_count += 1 + results["valid"] = valid + results["invalid"] = invalid + results["messages"] = messages + results["succeeded"] = succeeded + results["responses"] = responses + submitted = succeeded + valid + invalid # 1231 in test data + # get records missing in details from the submission.tsv + df = pd.read_csv(tsv, sep="\t", header=0) + missing_df = df.loc[ + ~df["submitter_id"].isin(submitted) + ] # these are records that timed-out, 240 in test data + missing = list(missing_df["submitter_id"]) + results["missing"] = missing + + # Find the rows in submitted TSV that are not in either failed or succeeded, 8 time outs in test data, 8*30 = 240 records + if write_tsvs == True: + print("Writing TSVs: ") + valid_df = df.loc[ + df["submitter_id"].isin(valid) + ] # these are records that weren't successful because they were part of a chunk that failed, but are valid and can be resubmitted without changes + invalid_df = df.loc[ + df["submitter_id"].isin(invalid) + ] # these are records that failed due to being invalid and should be reformatted + sub_name = ntpath.basename(tsv) + missing_file = "missing_" + sub_name + valid_file = "valid_" + sub_name + invalid_file = "invalid_" + sub_name + missing_df.to_csv(missing_file, sep="\t", index=False, encoding="utf-8") + valid_df.to_csv(valid_file, sep="\t", index=False, encoding="utf-8") + invalid_df.to_csv(invalid_file, sep="\t", index=False, encoding="utf-8") + print("\t" + missing_file) + print("\t" + valid_file) + print("\t" + invalid_file) + + return results + + def write_tsvs_from_results(self, invalid_ids, filename): + # Read the file in as a pandas DataFrame + f = os.path.basename(filename) + if f.lower().endswith(".csv"): + df = pd.read_csv(filename, header=0, sep=",", dtype=str).fillna("") + elif f.lower().endswith(".xlsx"): + xl = 
pd.ExcelFile(filename, dtype=str) # load excel file + sheet = xl.sheet_names[0] # sheetname + df = xl.parse(sheet) # save sheet as dataframe + converters = { + col: str for col in list(df) + } # make sure int isn't converted to float + df = pd.read_excel(filename, converters=converters).fillna("") # remove nan + elif filename.lower().endswith((".tsv", ".txt")): + df = pd.read_csv(filename, header=0, sep="\t", dtype=str).fillna("") + else: + print("Please upload a file in CSV, TSV, or XLSX format.") + exit(1) + + invalid_df = df.loc[ + df["submitter_id"].isin(invalid_ids) + ] # these are records that failed due to being invalid and should be reformatted + invalid_file = "invalid_" + f + ".tsv" + + print("Writing TSVs: ") + print("\t" + invalid_file) + invalid_df.to_csv(invalid_file, sep="\t", index=False, encoding="utf-8") + + return invalid_df + + def submit_df(self, project_id, df, chunk_size=1000, row_offset=0): + """Submit data in a pandas DataFrame.""" + df_type = list(set(df["type"])) + df.rename( + columns={c: c.lstrip("*") for c in df.columns}, inplace=True + ) # remove any leading asterisks in the DataFrame column names + + # Check uniqueness of submitter_ids: + if len(list(df.submitter_id)) != len(list(df.submitter_id.unique())): + raise Gen3Error( + "Warning: file contains duplicate submitter_ids. \nNote: submitter_ids must be unique within a node!" + ) + + # Chunk the file + print("Submitting {} DataFrame with {} records.".format(df_type, len(df))) + program, project = project_id.split("-", 1) + api_url = "{}/api/v0/submission/{}/{}".format(self._endpoint, program, project) + headers = {"content-type": "text/tab-separated-values"} + + start = row_offset + end = row_offset + chunk_size + chunk = df[start:end] + + count = 0 + + results = { + "invalid": {}, # these are invalid records + "other": [], # any unhandled API responses + "details": [], # entire API response details + "succeeded": [], # list of submitter_ids that were successfully updated/created + "responses": [], # list of API response codes + } + + # Start the chunking loop: + while (start + len(chunk)) <= len(df): + + timeout = False + valid_but_failed = [] + invalid = [] + count += 1 + print( + "\tChunk {} (chunk size: {}, submitted: {} of {})".format( + str(count), + str(chunk_size), + str(len(results["succeeded"]) + len(results["invalid"])), + str(len(df)), + ) + ) + + try: + response = requests.put( + api_url, + auth=self._auth_provider, + data=chunk.to_csv(sep="\t", index=False), + headers=headers, + ).text + except requests.exceptions.ConnectionError as e: + results["details"].append(e.message) + + # Handle the API response + if ( + "Request Timeout" in response + or "413 Request Entity Too Large" in response + or "Connection aborted." 
in response + or "service failure - try again later" in response + ): # time-out, response != valid JSON at the moment + + print("\t Reducing Chunk Size: {}".format(response)) + results["responses"].append("Reducing Chunk Size: {}".format(response)) + timeout = True + time.sleep(20) + + else: + try: + json_res = json.loads(response) + except Exception as e: + print(response) + print(str(e)) + raise Gen3Error("Unable to parse API response as JSON!") + + if "message" in json_res and "code" not in json_res: + print(json_res) # trouble-shooting + print( + "\t No code in the API response for Chunk {}: {}".format( + str(count), json_res.get("message") + ) + ) + print("\t {}".format(str(json_res.get("transactional_errors")))) + results["responses"].append( + "Error Chunk {}: {}".format(str(count), json_res.get("message")) + ) + results["other"].append(json_res.get("message")) + + elif "code" not in json_res: + print("\t Unhandled API-response: {}".format(response)) + results["responses"].append( + "Unhandled API response: {}".format(response) + ) + + elif json_res["code"] == 200: # success + + entities = json_res.get("entities", []) + print("\t Succeeded: {} entities.".format(str(len(entities)))) + results["responses"].append( + "Chunk {} Succeeded: {} entities.".format( + str(count), str(len(entities)) + ) + ) + + for entity in entities: + sid = entity["unique_keys"][0]["submitter_id"] + results["succeeded"].append(sid) + + elif ( + json_res["code"] == 400 + or json_res["code"] == 403 + or json_res["code"] == 404 + ): # failure + + entities = json_res.get("entities", []) + print("\tChunk Failed: {} entities.".format(str(len(entities)))) + results["responses"].append( + "Chunk {} Failed: {} entities.".format( + str(count), str(len(entities)) + ) + ) + + message = "" + for entity in entities: + sid = entity["unique_keys"][0]["submitter_id"] + if entity["valid"]: # valid but failed + valid_but_failed.append(sid) + else: # invalid and failed + message = str(entity["errors"]) + results["invalid"][sid] = message + invalid.append(sid) + print( + "\tInvalid records in this chunk: {}, {}".format( + len(invalid), message + ) + ) + + elif json_res["code"] == 500: # internal server error + + print("\t Internal Server Error: {}".format(response)) + results["responses"].append( + "Internal Server Error: {}".format(response) + ) + + if ( + len(valid_but_failed) > 0 and len(invalid) > 0 + ): # if valid entities failed bc grouped with invalid, retry submission + chunk = chunk.loc[ + df["submitter_id"].isin(valid_but_failed) + ] # these are records that weren't successful because they were part of a chunk that failed, but are valid and can be resubmitted without changes + print( + "Retrying submission of valid entities from failed chunk: {} valid entities.".format( + str(len(chunk)) + ) + ) + + elif ( + len(valid_but_failed) > 0 and len(invalid) == 0 + ): # if all entities are valid but submission still failed, probably due to duplicate submitter_ids. Can remove this section once the API response is fixed: https://ctds-planx.atlassian.net/browse/PXP-3065 + print("\tChunk with error:\n\n{}\n\n".format(chunk)) + print( + "\tUnhandled API response. Adding chunk to 'other' in results. Check for special characters or malformed links or property values." 
+ ) + results["other"].append(chunk) + start += chunk_size + end = start + chunk_size + chunk = df[start:end] + + elif timeout == False: # get new chunk if didn't timeout + start += chunk_size + end = start + chunk_size + chunk = df[start:end] + + else: # if timeout, reduce chunk size and retry smaller chunk + if chunk_size >= 2: + chunk_size = int(chunk_size / 2) + end = start + chunk_size + chunk = df[start:end] + print( + "Retrying Chunk with reduced chunk_size: {}".format( + str(chunk_size) + ) + ) + timeout = False + else: + raise Gen3SubmissionError( + "Submission is timing out. Please contact the Helpdesk." + ) + + print("Finished data submission.") + print("Successful records: {}".format(str(len(set(results["succeeded"]))))) + print("Failed invalid records: {}".format(str(len(results["invalid"])))) + + return results + + def submit_file( + self, + project_id, + filename, + chunk_size=30, + row_offset=0, + drop_props=["project_id"], + ): + """Submit data in a spreadsheet file containing multiple records in rows to a Gen3 Data Commons. + + Args: + project_id (str): The project_id to submit to. + filename (str): The file containing data to submit. The format can be TSV, CSV or XLSX (first worksheet only for now). + chunk_size (integer): The number of rows of data to submit for each request to the API. + row_offset (integer): The number of rows of data to skip; '0' starts submission from the first row and submits all data. + + Examples: + This submits a spreadsheet file containing multiple records in rows to the CCLE project in the sandbox commons. + + >>> Gen3Submission.submit_file("DCF-CCLE","data_spreadsheet.tsv") + + """ + # Read the file in as a pandas DataFrame + f = os.path.basename(filename) + if f.lower().endswith(".csv"): + df = pd.read_csv(filename, header=0, sep=",", dtype=str).fillna("") + elif f.lower().endswith(".xlsx"): + xl = pd.ExcelFile(filename, dtype=str) # load excel file + sheet = xl.sheet_names[0] # sheetname + df = xl.parse(sheet) # save sheet as dataframe + converters = { + col: str for col in list(df) + } # make sure int isn't converted to float + df = pd.read_excel(filename, converters=converters).fillna("") # remove nan + elif filename.lower().endswith((".tsv", ".txt")): + df = pd.read_csv(filename, header=0, sep="\t", dtype=str).fillna("") + else: + raise Gen3Error("Please upload a file in CSV, TSV, or XLSX format.") + df.rename( + columns={c: c.lstrip("*") for c in df.columns}, inplace=True + ) # remove any leading asterisks in the DataFrame column names + + # Check uniqueness of submitter_ids: + if len(list(df.submitter_id)) != len(list(df.submitter_id.unique())): + raise Gen3Error( + "Warning: file contains duplicate submitter_ids. \nNote: submitter_ids must be unique within a node!" 
+ ) + + if drop_props is not None: + if isinstance(drop_props, str): + drop_props = [drop_props] + elif isinstance(drop_props, list): + for prop in drop_props: + if prop in df: + df.drop(columns=[prop], inplace=True) + else: + print( + "\n\n\tSubmit drop_props argument as a list of properties, e.g.,: drop_props=['id'].\n\n" + ) + + # Chunk the file + print("\nSubmitting {} with {} records.".format(filename, str(len(df)))) + program, project = project_id.split("-", 1) + api_url = "{}/api/v0/submission/{}/{}".format(self._endpoint, program, project) + headers = {"content-type": "text/tab-separated-values"} + + start = row_offset + end = row_offset + chunk_size + chunk = df[start:end] + + count = 0 + + results = { + "invalid": {}, # these are invalid records + "other": [], # any unhandled API responses + "details": [], # entire API response details + "succeeded": [], # list of submitter_ids that were successfully updated/created + "responses": [], # list of API response codes + } + + # Start the chunking loop: + while (start + len(chunk)) <= len(df): + + timeout = False + valid_but_failed = [] + invalid = [] + count += 1 + print( + "Chunk {} (chunk size: {}, submitted: {} of {})".format( + str(count), + str(chunk_size), + str(len(results["succeeded"]) + len(results["invalid"])), + str(len(df)), + ) + ) + + try: + response = requests.put( + api_url, + auth=self._auth_provider, + data=chunk.to_csv(sep="\t", index=False), + headers=headers, + ).text + except requests.exceptions.ConnectionError as e: + results["details"].append(e.message) + + # Handle the API response + if ( + "Request Timeout" in response + or "413 Request Entity Too Large" in response + or "Connection aborted." in response + or "service failure - try again later" in response + ): # time-out, response != valid JSON at the moment + + print("\t Reducing Chunk Size: {}".format(response)) + results["responses"].append("Reducing Chunk Size: {}".format(response)) + timeout = True + time.sleep(20) + + else: + try: + json_res = json.loads(response) + except Exception as e: + print(response) + print(str(e)) + raise Gen3Error("Unable to parse API response as JSON!") + + if "message" in json_res and "code" not in json_res: + print(json_res) # trouble-shooting + print( + "\t No code in the API response for Chunk {}: {}".format( + str(count), json_res.get("message") + ) + ) + print("\t {}".format(str(json_res.get("transactional_errors")))) + results["responses"].append( + "Error Chunk {}: {}".format(str(count), json_res.get("message")) + ) + results["other"].append(json_res.get("message")) + + elif "code" not in json_res: + print("\t Unhandled API-response: {}".format(response)) + results["responses"].append( + "Unhandled API response: {}".format(response) + ) + + elif json_res["code"] == 200: # success + + entities = json_res.get("entities", []) + print("\t Succeeded: {} entities.".format(str(len(entities)))) + results["responses"].append( + "Chunk {} Succeeded: {} entities.".format( + str(count), str(len(entities)) + ) + ) + + for entity in entities: + sid = entity["unique_keys"][0]["submitter_id"] + results["succeeded"].append(sid) + + elif ( + json_res["code"] == 400 + or json_res["code"] == 403 + or json_res["code"] == 404 + ): # failure + + entities = json_res.get("entities", []) + print("\tChunk Failed: {} entities.".format(str(len(entities)))) + results["responses"].append( + "Chunk {} Failed: {} entities.".format( + str(count), str(len(entities)) + ) + ) + + message = "" + for entity in entities: + sid = 
entity["unique_keys"][0]["submitter_id"] + if entity["valid"]: # valid but failed + valid_but_failed.append(sid) + else: # invalid and failed + message = str(entity["errors"]) + results["invalid"][sid] = message + invalid.append(sid) + print( + "\tInvalid records in this chunk: {}, {}".format( + len(invalid), message + ) + ) + + elif json_res["code"] == 500: # internal server error + + print("\t Internal Server Error: {}".format(response)) + results["responses"].append( + "Internal Server Error: {}".format(response) + ) + + if ( + len(valid_but_failed) > 0 and len(invalid) > 0 + ): # if valid entities failed bc grouped with invalid, retry submission + chunk = chunk.loc[ + df["submitter_id"].isin(valid_but_failed) + ] # these are records that weren't successful because they were part of a chunk that failed, but are valid and can be resubmitted without changes + print( + "Retrying submission of valid entities from failed chunk: {} valid entities.".format( + str(len(chunk)) + ) + ) + + elif ( + len(valid_but_failed) > 0 and len(invalid) == 0 + ): # if all entities are valid but submission still failed, probably due to duplicate submitter_ids. Can remove this section once the API response is fixed: https://ctds-planx.atlassian.net/browse/PXP-3065 + # raise Gen3Error( + # "Please check your data for correct file encoding, special characters, or duplicate submitter_ids or ids." + # ) + print( + "\tUnhandled API response. Adding chunk to 'other' in results. Check for special characters or malformed links or property values." + ) + results["other"].append(chunk) + start += chunk_size + end = start + chunk_size + chunk = df[start:end] + + elif timeout == False: # get new chunk if didn't timeout + start += chunk_size + end = start + chunk_size + chunk = df[start:end] + + else: # if timeout, reduce chunk size and retry smaller chunk + if chunk_size >= 2: + chunk_size = int(chunk_size / 2) + end = start + chunk_size + chunk = df[start:end] + print( + "Retrying Chunk with reduced chunk_size: {}".format( + str(chunk_size) + ) + ) + timeout = False + else: + raise Gen3SubmissionError( + "Submission is timing out. Please contact the Helpdesk." + ) + + print("Finished data submission.") + print("Successful records: {}".format(str(len(set(results["succeeded"]))))) + print("Failed invalid records: {}".format(str(len(results["invalid"])))) + + return results + + def submit_file_dry( + self, + project_id, + filename, + chunk_size=30, + row_offset=0, + drop_props=["project_id"], + ): + """Submit data in a spreadsheet file containing multiple records in rows to a Gen3 Data Commons. + + Args: + project_id (str): The project_id to submit to. + filename (str): The file containing data to submit. The format can be TSV, CSV or XLSX (first worksheet only for now). + chunk_size (integer): The number of rows of data to submit for each request to the API. + row_offset (integer): The number of rows of data to skip; '0' starts submission from the first row and submits all data. + + Examples: + This submits a spreadsheet file containing multiple records in rows to the CCLE project in the sandbox commons. 
+ + >>> Gen3Submission.submit_file("DCF-CCLE","data_spreadsheet.tsv") + + """ + # Read the file in as a pandas DataFrame + f = os.path.basename(filename) + if f.lower().endswith(".csv"): + df = pd.read_csv(filename, header=0, sep=",", dtype=str).fillna("") + elif f.lower().endswith(".xlsx"): + xl = pd.ExcelFile(filename, dtype=str) # load excel file + sheet = xl.sheet_names[0] # sheetname + df = xl.parse(sheet) # save sheet as dataframe + converters = { + col: str for col in list(df) + } # make sure int isn't converted to float + df = pd.read_excel(filename, converters=converters).fillna("") # remove nan + elif filename.lower().endswith((".tsv", ".txt")): + df = pd.read_csv(filename, header=0, sep="\t", dtype=str).fillna("") + else: + raise Gen3Error("Please upload a file in CSV, TSV, or XLSX format.") + df.rename( + columns={c: c.lstrip("*") for c in df.columns}, inplace=True + ) # remove any leading asterisks in the DataFrame column names + + # Check uniqueness of submitter_ids: + if len(list(df.submitter_id)) != len(list(df.submitter_id.unique())): + raise Gen3Error( + "Warning: file contains duplicate submitter_ids. \nNote: submitter_ids must be unique within a node!" + ) + + if drop_props is not None: + if isinstance(drop_props, str): + drop_props = [drop_props] + elif isinstance(drop_props, list): + for prop in drop_props: + if prop in df: + df.drop(columns=[prop], inplace=True) + else: + print( + "\n\n\tSubmit drop_props argument as a list of properties, e.g.,: drop_props=['id'].\n\n" + ) + + # Chunk the file + print("\nSubmitting {} with {} records.".format(filename, str(len(df)))) + program, project = project_id.split("-", 1) + api_url = "{}/api/v0/submission/{}/{}/_dry_run".format( + self._endpoint, program, project + ) + headers = {"content-type": "text/tab-separated-values"} + + start = row_offset + end = row_offset + chunk_size + chunk = df[start:end] + + count = 0 + + results = { + "invalid": {}, # these are invalid records + "other": [], # any unhandled API responses + "details": [], # entire API response details + "succeeded": [], # list of submitter_ids that were successfully updated/created + "responses": [], # list of API response codes + } + + # Start the chunking loop: + while (start + len(chunk)) <= len(df): + + timeout = False + valid_but_failed = [] + invalid = [] + count += 1 + print( + "Chunk {} (chunk size: {}, submitted: {} of {})".format( + str(count), + str(chunk_size), + str(len(results["succeeded"]) + len(results["invalid"])), + str(len(df)), + ) + ) + + try: + response = requests.put( + api_url, + auth=self._auth_provider, + data=chunk.to_csv(sep="\t", index=False), + headers=headers, + ).text + except requests.exceptions.ConnectionError as e: + results["details"].append(e.message) + + # Handle the API response + if ( + "Request Timeout" in response + or "413 Request Entity Too Large" in response + or "Connection aborted." 
in response + or "service failure - try again later" in response + ): # time-out, response != valid JSON at the moment + + print("\t Reducing Chunk Size: {}".format(response)) + results["responses"].append("Reducing Chunk Size: {}".format(response)) + timeout = True + time.sleep(20) + + else: + try: + json_res = json.loads(response) + except Exception as e: + print(response) + print(str(e)) + raise Gen3Error("Unable to parse API response as JSON!") + + if "message" in json_res and "code" not in json_res: + print(json_res) # trouble-shooting + print( + "\t No code in the API response for Chunk {}: {}".format( + str(count), json_res.get("message") + ) + ) + print("\t {}".format(str(json_res.get("transactional_errors")))) + results["responses"].append( + "Error Chunk {}: {}".format(str(count), json_res.get("message")) + ) + results["other"].append(json_res.get("message")) + + elif "code" not in json_res: + print("\t Unhandled API-response: {}".format(response)) + results["responses"].append( + "Unhandled API response: {}".format(response) + ) + + elif json_res["code"] == 200: # success + + entities = json_res.get("entities", []) + print("\t Succeeded: {} entities.".format(str(len(entities)))) + results["responses"].append( + "Chunk {} Succeeded: {} entities.".format( + str(count), str(len(entities)) + ) + ) + + for entity in entities: + sid = entity["unique_keys"][0]["submitter_id"] + results["succeeded"].append(sid) + + elif ( + json_res["code"] == 400 + or json_res["code"] == 403 + or json_res["code"] == 404 + ): # failure + + entities = json_res.get("entities", []) + print("\tChunk Failed: {} entities.".format(str(len(entities)))) + results["responses"].append( + "Chunk {} Failed: {} entities.".format( + str(count), str(len(entities)) + ) + ) + + message = "" + for entity in entities: + sid = entity["unique_keys"][0]["submitter_id"] + if entity["valid"]: # valid but failed + valid_but_failed.append(sid) + else: # invalid and failed + message = str(entity["errors"]) + results["invalid"][sid] = message + invalid.append(sid) + print( + "\tInvalid records in this chunk: {}, {}".format( + len(invalid), message + ) + ) + + elif json_res["code"] == 500: # internal server error + + print("\t Internal Server Error: {}".format(response)) + results["responses"].append( + "Internal Server Error: {}".format(response) + ) + + if ( + len(valid_but_failed) > 0 and len(invalid) > 0 + ): # if valid entities failed bc grouped with invalid, retry submission + chunk = chunk.loc[ + df["submitter_id"].isin(valid_but_failed) + ] # these are records that weren't successful because they were part of a chunk that failed, but are valid and can be resubmitted without changes + print( + "Retrying submission of valid entities from failed chunk: {} valid entities.".format( + str(len(chunk)) + ) + ) + + elif ( + len(valid_but_failed) > 0 and len(invalid) == 0 + ): # if all entities are valid but submission still failed, probably due to duplicate submitter_ids. Can remove this section once the API response is fixed: https://ctds-planx.atlassian.net/browse/PXP-3065 + # raise Gen3Error( + # "Please check your data for correct file encoding, special characters, or duplicate submitter_ids or ids." + # ) + print( + "\tUnhandled API response. Adding chunk to 'other' in results. Check for special characters or malformed links or property values." 
+ ) + results["other"].append(chunk) + start += chunk_size + end = start + chunk_size + chunk = df[start:end] + + elif timeout == False: # get new chunk if didn't timeout + start += chunk_size + end = start + chunk_size + chunk = df[start:end] + + else: # if timeout, reduce chunk size and retry smaller chunk + if chunk_size >= 2: + chunk_size = int(chunk_size / 2) + end = start + chunk_size + chunk = df[start:end] + print( + "Retrying Chunk with reduced chunk_size: {}".format( + str(chunk_size) + ) + ) + timeout = False + else: + raise Gen3SubmissionError( + "Submission is timing out. Please contact the Helpdesk." + ) + + print("Finished data submission.") + print("Successful records: {}".format(str(len(set(results["succeeded"]))))) + print("Failed invalid records: {}".format(str(len(results["invalid"])))) + + return results + + # indexd functions: + def query_indexd(self, limit=100, page=0, uploader=None, args=None): + """Queries indexd with given records limit and page number. + For example: + records = query_indexd(api='https://icgc.bionimbus.org/',limit=1000,page=0) + https://icgc.bionimbus.org/index/index/?limit=1000&page=0 + """ + data, records = {}, [] + + if uploader == None: + index_url = "{}/index/index/?limit={}&page={}".format( + self._endpoint, limit, page + ) + else: + index_url = "{}/index/index/?limit={}&page={}&uploader={}".format( + self._endpoint, limit, page, uploader + ) + + if args != None: + index_url = "{}&{}".format(index_url, args) + + try: + response = requests.get(index_url).text + data = json.loads(response) + except Exception as e: + print( + "\tUnable to parse indexd response as JSON!\n\t\t{} {}".format( + type(e), e + ) + ) + + if "records" in data: + records = data["records"] + else: + print( + "\tNo records found in data from '{}':\n\t\t{}".format(index_url, data) + ) + + return records + + def get_indexd(self, limit=1000, page=0, format="JSON", uploader=None, args=None): + """get all the records in indexd + api = "https://icgc.bionimbus.org/" + args = lambda: None + setattr(args, 'api', api) + setattr(args, 'limit', 100) + page = 0 + + Usage: + i = exp.get_indexd(format="TSV", uploader=orcid) + i = exp.get_indexd(format="TSV", args="authz=/programs/TCIA/projects/COVID-19-AR") + """ + if format in ["JSON", "TSV"]: + dc_regex = re.compile(r"https:\/\/(.+)\/?$") + dc = dc_regex.match(self._endpoint).groups()[0] + dc = dc.strip("/") + else: + print( + "\n\n'{}' != a valid output format. Please provide a format of either 'JSON' or 'TSV'.\n\n".format( + format + ) + ) + + stats_url = "{}/index/_stats".format(self._endpoint) + try: + response = requests.get(stats_url).text + stats = json.loads(response) + print("Stats for '{}': {}".format(self._endpoint, stats)) + except Exception as e: + print( + "\tUnable to parse indexd response as JSON!\n\t\t{} {}".format( + type(e), e + ) + ) + + print( + "Getting all records in indexd (limit: {}, starting at page: {})".format( + limit, page + ) + ) + + all_records = [] + + done = False + while done == False: + + records = self.query_indexd( + limit=limit, page=page, uploader=uploader, args=args + ) + all_records.extend(records) + + if len(records) != limit: + print( + "\tLength of returned records ({}) does not equal limit ({}).".format( + len(records), limit + ) + ) + if len(records) == 0: + done = True + + print( + "\tPage {}: {} records ({} total)".format( + page, len(records), len(all_records) + ) + ) + page += 1 + + print( + "\t\tScript finished. 
Total records retrieved: {}".format(len(all_records)) + ) + + now = datetime.datetime.now() + date = "{}-{}-{}_{}.{}".format( + now.year, now.month, now.day, now.minute, now.second + ) + + if format == "JSON": + outname = "{}_indexd_records_{}.json".format(dc, date) + with open(outname, "w") as output: + output.write(json.dumps(all_records)) + + if format == "TSV": + outname = "{}_indexd_records_{}.tsv".format(dc, date) + all_records = pd.DataFrame(all_records) + all_records["md5sum"] = [hashes.get("md5") for hashes in all_records.hashes] + all_records.to_csv(outname, sep="\t", index=False) + + return all_records + + def delete_indexd_records(self, irecs): + """ + Arguments: + irecs(list): A list of indexd records. Get with, e.g., function: + Gen3Expansion.get_indexd(uploader="cgmeyer@uchicagp.edu") + """ + total, count = len(irecs), 0 + success, failure = [], [] + for irec in irecs: + count += 1 + guid = irec["did"] + rev = irec["rev"] + index_url = "{}/index/index/{}?rev={}".format(self._endpoint, guid, rev) + access_token = self.get_token() + headers = { + "Content-Type": "application/json", + "Authorization": "bearer {}".format(access_token), + } + response = requests.delete(index_url, headers=headers) + if response.status_code == 200: + success.append(guid) + print( + "{}/{} {}: Successfully deleted '{}'.".format( + count, total, response.status_code, guid + ) + ) + else: + failure.append(guid) + print( + "{}/{} {}: Failed to delete '{}'.".format( + count, total, response.status_code, guid + ) + ) + if len(success) > 0: + print("Successfully deleted {} indexd records.".format(len(success))) + self.nuked() + return {"success": success, "failure": failure} + + def remove_uploader_from_indexd(self, irecs): + """ + Arguments: + irecs(list): A list of indexd records. 
Get with, e.g., function: + Gen3Expansion.get_indexd(uploader="cgmeyer@uchicagp.edu") + """ + total, count = len(irecs), 0 + success, failure = [], [] + for irec in irecs: + count += 1 + guid = irec["did"] + rev = irec["rev"] + payload = {"uploader": None} + index_url = "{}/index/index/{}?rev={}".format(self._endpoint, guid, rev) + access_token = self.get_token() + headers = { + "Content-Type": "application/json", + "Authorization": "bearer {}".format(access_token), + } + response = requests.put(index_url, headers=headers, json=payload) + if response.status_code == 200: + success.append(guid) + else: + failure.append(guid) + print( + "{}/{} {}: {}".format( + count, total, response.status_code, response.text.encode("utf8") + ) + ) + return {"success": success, "failure": failure} + + def get_urls(self, guids): + # Get URLs for a list of GUIDs + if isinstance(guids, str): + guids = [guids] + if isinstance(guids, list): + urls = {} + for guid in guids: + index_url = "{}/index/{}".format(self._endpoint, guid) + output = requests.get(index_url, auth=self._auth_provider).text + guid_index = json.loads(output) + url = guid_index["urls"][0] + urls[guid] = url + else: + print( + "Please provide one or a list of data file GUIDs: get_urls\(guids=guid_list\)" + ) + return urls + + def get_guids_for_file_names(self, file_names, method="indexd", match="file_name"): + # Get GUIDs for a list of file_names + if isinstance(file_names, str): + file_names = [file_names] + if not isinstance(file_names, list): + print( + "Please provide one or a list of data file file_names: get_guid_for_filename\(file_names=file_name_list\)" + ) + guids = {} + if method == "indexd": + count, total = 0, len(file_names) + for file_name in file_names: + count += 1 + print( + "({}/{}) Retrieving GUID for '{}'.".format(count, total, file_name) + ) + index_url = "{}/index/index/?file_name={}".format( + self._endpoint, file_name + ) + response = requests.get(index_url, auth=self._auth_provider).text + index_record = json.loads(response) + if len(index_record["records"]) > 0: + # did = index_record["records"][0]["did"] + dids = [i["did"] for i in index_record["records"]] + guids[file_name] = dids + else: + print( + "No records found in indexd with file_name: '{}'!".format( + file_name + ) + ) + guids[file_name] = np.nan + elif method == "sheepdog": + for file_name in file_names: + if match == "file_name": + args = 'file_name:"{}"'.format(file_name) + elif match == "submitter_id": + args = 'submitter_id:"{}"'.format(file_name) + props = ["object_id"] + res = self.paginate_query(node="datanode", args=args, props=props) + recs = res["data"]["datanode"] + if len(recs) >= 1: + guid = recs[0]["object_id"] + guids[file_name] = guid + else: + print( + "Found no sheepdog records with {}: {}".format( + method, file_name + ) + ) + if len(recs) > 1: + guids = [rec["object_id"] for rec in recs] + guids[file_name] = guids + print( + "Found more than 1 sheepdog record with {}: {}".format( + method, file_name + ) + ) + else: + print("Enter a valid method.\n\tValid methods: 'sheepdog','indexd'") + + with open("guids_filenames_map.txt", "w") as guids_map: + guids_map.write(json.dumps(guids)) + + return guids + + def get_index_for_file_names(self, file_names, format="tsv"): + # Get GUIDs for a list of file_names + if isinstance(file_names, str): + file_names = [file_names] + if not isinstance(file_names, list): + print( + "Please provide one or a list of data file file_names: get_guid_for_filename\(file_names=file_name_list\)" + ) + all_records = [] + 
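+ # Added sketch of the lookup performed in the loop below, assuming a hypothetical
+ # file_name "reads_001.bam":
+ #   GET {endpoint}/index/index/?file_name=reads_001.bam
+ # The "records" list in each response is checked and the matching record is kept in all_records.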
count, total = 0, len(file_names) + for file_name in file_names: + count += 1 + print( + "\t({}/{})Getting indexd record for {}".format(count, total, file_name) + ) + index_url = "{}/index/index/?file_name={}".format(self._endpoint, file_name) + response = requests.get(index_url, auth=self._auth_provider).text + try: + response = json.loads(response) + except Exception as e: + print("Error: {}: {}".format(response, e)) + if "records" in response: + records = response["records"] + if len(records) == 1: + irec = records[0] + else: + print( + "\tMultiple indexd records found for file_name '{}'!\n\t{}\n".format( + file_name, records + ) + ) + else: + print("\tNo indexd records found for file_name '{}'!".format(file_name)) + all_records.append(irec) + + if all_records == None: + print("No records in the index with authz {}.".format(authz)) + + elif format == "tsv": + df = json_normalize(all_records) + filename = "indexd_records_for_filenames.tsv" + df.to_csv(filename, sep="\t", index=False, encoding="utf-8") + return df + + elif format == "guids": + guids = [] + for record in all_records: + guids.append(record["did"]) + return guids + + else: + return all_records + + return all_records + + def get_index_for_authz(self, authz, format="tsv", page=0, limit=100): + # Get GUIDs for a particular project (authz) + # https://data.bloodpac.org/index/index/?authz=/programs/bpa/projects/UAMS_P0001_T1 + # exp.get_index_for_authz(authz='/programs/bpa/projects/UAMS_P0001_T1') + + if isinstance(authz, list): + authz = authz[0] + + all_records, records = [], [] + done = False + while done == False: + + index_url = "{}/index/index/?limit={}&page={}&authz={}".format( + self._endpoint, limit, page, authz + ) + # index_url = "{}/index/index/?limit={}&page={}&authz={}".format(api,limit,page,authz) + response = requests.get(index_url, auth=self._auth_provider).text + # response = requests.get(index_url, auth=auth).text + data = json.loads(response) + if "records" in data: + records = data["records"] + all_records.extend(records) + + if len(records) == 0: + done = True + + print( + "\tPage {}: {} records ({} total)".format( + page, len(records), len(all_records) + ) + ) + page += 1 + + print( + "\t\tScript finished. 
Total records retrieved: {}".format(len(all_records)) + ) + + # index_url = "{}/index/index/?authz={}".format(self._endpoint, authz) + # response = requests.get(index_url, auth=self._auth_provider).text + # records = json.loads(response)["records"] + # data.append(records) + # + + if all_records == None: + print("No records in the index with authz {}.".format(authz)) + + elif format == "tsv": + df = json_normalize(all_records) + filename = "indexd_records_for_{}.tsv".format(authz.split("/")[-1]) + df.to_csv(filename, sep="\t", index=False, encoding="utf-8") + return df + + elif format == "guids": + guids = [] + for record in all_records: + guids.append(record["did"]) + return guids + + else: + return all_records + + return all_records + + def get_index_for_acl(self, acl, format="guids", page=0, limit=100): + # Get GUIDs for a particular project (acl) + # https://data.bloodpac.org/index/index/?acl=UAMS_P0001_T1,bpa + # exp.get_index_for_acl(acl='UAMS_P0001_T1,bpa') + + if isinstance(acl, list): + acl = "{},{}".format(acl[0], acl[1]) + + all_records, records = [], [] + done = False + while done == False: + + index_url = "{}/index/index/?limit={}&page={}&acl={}".format( + self._endpoint, limit, page, acl + ) + # index_url = "{}/index/index/?limit={}&page={}&acl={}".format(api,limit,page,acl) + response = requests.get(index_url, auth=self._auth_provider).text + # response = requests.get(index_url, auth=auth).text + data = json.loads(response) + if "records" in data: + records = data["records"] + all_records.extend(records) + + if len(records) == 0: + done = True + + print( + "\tPage {}: {} records ({} total)".format( + page, len(records), len(all_records) + ) + ) + page += 1 + + print( + "\t\tScript finished. Total records retrieved: {}".format(len(all_records)) + ) + + if all_records == None: + print("No records in the index with acl {}.".format(acl)) + + elif format == "tsv": + df = json_normalize(all_records) + filename = "indexd_records_for_{}.tsv".format(acl) + df.to_csv(filename, sep="\t", index=False, encoding="utf-8") + return df + + elif format == "guids": + guids = [] + for record in all_records: + guids.append(record["did"]) + return guids + + else: + return all_records + + return all_records + + def get_index_for_url(self, url): + """Returns the indexd record for a file's storage location URL ('urls' in indexd) + Example: + api='https://icgc.bionimbus.org/' + url='s3://pcawg-tcga-sarc-us/2720a2b8-3f4e-5b6e-9f74-1067a068462a' + exp.get_index_for_url(url=url,api=api) + """ + indexd_endpoint = "{}/index/index/".format(self._endpoint) + indexd_query = "{}?url={}".format(indexd_endpoint, url) + output = requests.get(indexd_query, auth=self._auth_provider).text + response = json.loads(output) + index_records = response["records"] + return index_records + + def get_index_for_guids(self, guids, chunk_size=2000): + """Uses the index/bulk/documents endpoint to fetch all the indexd records for a list of GUIDs""" + if isinstance(guids, str): + guids = [guids] + + chunk_size = 2000 + lower, upper = 0, chunk_size + all_records = [] + while lower < len(guids): + print( + "(Fetching {}/{} GUIDs): fetching {} through {}.".format( + len(all_records), len(guids), lower, upper + ) + ) + bulk_guids = guids[lower:upper] + bulk_endpoint = "{}/index/bulk/documents".format(self._endpoint) + response = requests.post( + bulk_endpoint, json=bulk_guids, auth=self._auth_provider + ) + print(response) + irecs = json.loads(response.text) + len(irecs) + all_records = all_records + irecs + lower += chunk_size + 
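+ # advance the slice window to the next batch of GUIDs for the bulk documents request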
upper += chunk_size + if lower > len(guids): + print( + "Finished. {} total indexd records fetched for {} provided GUIDs.".format( + len(all_records), len(guids) + ) + ) + + now = datetime.datetime.now() + date = "{}-{}-{}_{}.{}".format( + now.year, now.month, now.day, now.minute, now.second + ) + + if format == "JSON": + outname = "indexd_records_{}.json".format(date) + with open(outname, "w") as output: + output.write(json.dumps(all_records)) + + if format == "TSV": + outname = "indexd_records_{}.tsv".format(date) + all_records = pd.DataFrame(all_records) + all_records["md5sum"] = [hashes.get("md5") for hashes in all_records.hashes] + all_records.to_csv(outname, sep="\t", index=False) + + return all_records + + def get_index_for_guids_old(self, guids): + """Returns the indexd record for a list of GUIDs ('urls' in indexd)""" + if isinstance(guids, str): + guids = [guids] + + all_records = [] + for guid in guids: + print( + "\tGetting index for GUID ({}/{}): {}".format( + len(all_records), len(guids), guid + ) + ) + indexd_endpoint = "{}/index/index/".format(self._endpoint) + indexd_query = "{}{}".format(indexd_endpoint, guid) + response = requests.get(indexd_query, auth=self._auth_provider).text + records = json.loads(response) + all_records.append(records) + + now = datetime.datetime.now() + date = "{}-{}-{}_{}.{}".format( + now.year, now.month, now.day, now.minute, now.second + ) + + if format == "JSON": + outname = "indexd_records_{}.json".format(date) + with open(outname, "w") as output: + output.write(json.dumps(all_records)) + + if format == "TSV": + outname = "indexd_records_{}.tsv".format(date) + all_records = pd.DataFrame(all_records) + all_records["md5sum"] = [hashes.get("md5") for hashes in all_records.hashes] + all_records.to_csv(outname, sep="\t", index=False) + + return all_records + + # failed = [irec for irec in irecs if irec['size'] == None] + # failed_guids = [irec['did'] for irec in failed] + + def get_guid_for_url(self, url): + """Return the GUID for a file's URL in indexd + Example: + api='https://icgc.bionimbus.org/' + url='s3://pcawg-tcga-sarc-us/2720a2b8-3f4e-5b6e-9f74-1067a068462a' + exp.get_guid_for_url(url=url,api=api) + """ + index_records = self.get_index_for_url(url=url) + if len(index_records) == 1: + guid = index_records[0]["did"] + return guid + else: + guids = [] + for index_record in index_records: + guids.append(index_record["did"]) + return guids + + def delete_uploaded_files(self, guids): + """ + DELETE http://petstore.swagger.io/?url=https://raw.githubusercontent.com/uc-cdis/fence/master/openapis/swagger.yaml#/data/delete_data__file_id_ + Deletes all locations of a stored data file and remove its record from indexd. + After a user uploads a data file and it is registered in indexd, + but before it is mapped into the graph via metadata submission, + this endpoint will delete the file from its storage locations (saved in the record in indexd) + and delete the record in indexd. + + Args: + guids (list): The list of GUIDs to delete. 
+ + Examples: + >>> Gen3Expansion.delete_uploaded_files(guids="dg.7519/fd0d91e0-87a6-4627-80b4-50d98614c560") + >>> Gen3Expansion.delete_uploaded_files(guids=["dg.7519/fd0d91e0-87a6-4627-80b4-50d98614c560","dg.7519/bc78b25d-6203-4d5f-9257-cc6bba3fc34f"]) + """ + if isinstance(guids, str): + guids = [guids] + + if not isinstance(guids, list): + raise Gen3Error("Please, supply GUIDs as a list.") + + count, total = 0, len(guids) + deleted, failed = [], [] + for guid in guids: + count += 1 + fence_url = "{}/user/data/".format(self._endpoint) + + try: + response = requests.delete(fence_url + guid, auth=self._auth_provider) + except requests.exceptions.ConnectionError as e: + raise Gen3Error(e) + + if response.status_code == 204: + print("({}/{}) Successfully deleted GUID {}".format(count, total, guid)) + deleted.append(guid) + else: + print("({}/{}) Error deleting GUID {}:".format(count, total, guid)) + print(response.reason) + failed.append(guid) + if len(deleted) > 0: + print("Successfully deleted {} uploaded files.".format(len(deleted))) + self.nuked() + return {"deleted": deleted, "failed": failed} + + # Data commons summary functions + + def t(self, var): + vtype = type(var) + print(vtype) + if vtype in [dict, list]: + print("{}".format(list(var))) + if vtype in [str, int, float]: + print("{}".format(var)) + + def create_output_dir(self, outdir="data_summary_reports"): + cmd = ["mkdir", "-p", outdir] + try: + output = subprocess.check_output(cmd, stderr=subprocess.STDOUT).decode( + "UTF-8" + ) + except Exception as e: + output = e.output.decode("UTF-8") + print("ERROR:" + output) + return outdir + + def old_list_links(self, link_list, dd): + """return a list of indiv link names.""" + link_names = [] + for link in link_list: + if "subgroup" in link: + sublinks = list(link["subgroup"]) + for sublink in sublinks: + link_names.append(sublink["name"]) + else: + link_names.append(link["name"]) + return link_names + + def list_links(self, node, dd): + """return a list of indiv link names for a node""" + link_list = dd[node]["links"] + link_names = [] + for link in link_list: + if "subgroup" in link: + sublinks = list(link["subgroup"]) + for sublink in sublinks: + link_names.append(sublink["name"]) + else: + link_names.append(link["name"]) + return link_names + + def get_prop_type(self, node, prop, dd): + prop_def = dd[node]["properties"][prop] + if "type" in prop_def: + prop_type = prop_def["type"] + if "null" in prop_type: + prop_type = [prop for prop in prop_type if prop != "null"][0] + elif "enum" in prop_def: + prop_type = "enum" + elif "oneOf" in prop_def: + if "type" in prop_def["oneOf"][0]: + prop_type = prop_def["oneOf"][0]["type"] + elif "enum" in prop_def["oneOf"][0]: + prop_type = "enum" + elif "anyOf" in prop_def: + if isinstance(prop_def["anyOf"], list): + prop_type = [x["type"] for x in prop_def["anyOf"] if "items" in x][0] + else: + prop_type = prop_def["anyOf"] + else: + print("Can't get the property type for {}!".format(shared_prop)) + return prop_type + + def summarize_dd( + self, + props_to_remove=[ + "id", + "submitter_id", + "project_id", + "created_datetime", + "updated_datetime", + "case_submitter_id", + "state", + "type", + "md5sum", + "object_id", + "file_state", + "file_size", + "file_name", + "projects", + ], + nodes_to_remove=[ + "root", + "metaschema", + "program", + "project", + "core_metadata_collection", + ], + ): + """Return a dict with nodes and list of properties in each node.""" + dd = self.sub.get_dictionary_all() + nodes = [] + node_regex = re.compile( + 
r"^[^_][A-Za-z0-9_]+$" + ) # don't match _terms,_settings,_definitions, etc.) + nodes = list(filter(node_regex.search, list(dd))) + + dds = {} + for node in nodes: + if node not in nodes_to_remove: + print("\n\tnode: {}".format(node)) + dds[node] = {} + dds[node]["title"] = dd[node]["title"].strip() + props = list(dd[node]["properties"]) + + for prop in props: + if prop not in props_to_remove: + print("\n\tprop: {}".format(prop)) + if "description" in dd[node]["properties"][prop]: + dds[node][prop] = dd[node]["properties"][prop][ + "description" + ].strip() + else: + dds[node][prop] = prop + + return dds + + def summarize_tsvs( + self, + tsv_dir, + dd, + prefix="", + outlier_threshold=10, + omit_props=[ + "project_id", + "type", + "id", + "submitter_id", + "case_submitter_id", + "case_ids", + "visit_id", + "sample_id", + "md5sum", + "file_name", + "object_id", + "series_uid", + "study_uid", + "token_record_id", + ], + omit_nodes=["metaschema", "root", "program", "project", "data_release"], + outdir=".", + bin_limit=False, + write_report=True, + report_null=True, + ): + """ + Returns a summary of TSV data per project, node, and property in the specified directory "tsv_dir". + For each property in each project, the total, non-null and null counts are returned. + For string, enumeration and boolean properties, bins and the number of unique bins are returned. + For integers and numbers, the mean, median, min, max, and stdev are returned. + Outliers in numeric data are identified using "+/- stdev". The cut-off for outlier identification can be changed by raising or lowering the outlier_threshold (common setting is ~3). + + Args: + tsv_dir(str): project_tsvs directory + dd(dict): data dictionary of the commons result of func Gen3Submission.get_dictionary_all() + prefix(str): Default gets TSVs from all directories ending in "_tsvs". "prefix" of the project_tsvs directories (e.g., program name of the projects: "Program_1-Project_2_tsvs"). Result of running the Gen3Expansion.get_project_tsvs() function. + outlier_threshold(number): The upper/lower threshold for identifying outliers in numeric data is the standard deviation multiplied by this number. + omit_props(list): Properties to omit from being summarized. It doesn't make sense to summarize certain properties, e.g., those with all unique values. May want to omit: ['sample_id','specimen_number','current_medical_condition_name','medical_condition_name','imaging_results','medication_name']. + omit_nodes(list): Nodes in the data dictionary to omit from being summarized, e.g., program, project, data_release, root and metaschema. + outdir(str): A directory for the output files. 
+ + Examples: + s = summarize_tsvs(tsv_dir='project_tsvs/', + dd=dd) + """ + + summary = {} + + report = pd.DataFrame( + columns=[ + "prop_id", + "project_id", + "node", + "property", + "type", + "N", + "nn", + "null", + "perc_null", + "all_null", + "min", + "max", + "median", + "mean", + "stdev", + "outliers", + "bin_number", + "bins", + ] + ) + report["all_null"] = report["all_null"].astype(bool) + + dir_pattern = "{}*{}".format(prefix, "tsvs") + project_dirs = glob.glob("{}/{}".format(tsv_dir, dir_pattern)) + + nn_nodes, nn_props, null_nodes, null_props, all_prop_ids = [], [], [], [], [] + + msg = "Summarizing TSVs in '{}':\n".format(tsv_dir) + print("\n\n{}".format(msg)) + + for project_dir in project_dirs: # project_dir=project_dirs[0] + + try: + project_id = re.search( + r"^{}/?([A-Za-z0-9_-]+)_tsvs$".format(tsv_dir), project_dir + ).group(1) + except: + print( + "Couldn't extract the project_id from project_dir '{}'!".format( + project_dir + ) + ) + + fpattern = "{}*{}".format(prefix, ".tsv") + fnames = glob.glob("{}/{}".format(project_dir, fpattern)) + + # msg = "\t\tFound the following {} TSVs: {}".format(len(fnames),fnames) + # sys.stdout.write("\r" + str(msg)) + + # print(fnames) # trouble-shooting + if len(fnames) == 0: + continue + + for ( + fname + ) in ( + fnames + ): # Each node with data in the project is in one TSV file so len(fnames) is the number of nodes in the project with data. + + # print("\n\t\t{}".format(fname)) # trouble-shooting + + node_regex = ( + re.escape(project_id) + r"_([a-zA-Z0-9_]+)\.tsv$" + ) # node = re.search(r'^([a-zA-Z0-9_]+)-([a-zA-Z0-9]+)_([a-zA-Z0-9_]+)\.tsv$',fname).group(3) + + try: + node = re.search(node_regex, fname, re.IGNORECASE).group(1) + + except Exception as e: + print( + "\n\nCouldn't set node with node_regex on '{}':\n\t{}".format( + fname, e + ) + ) + node = fname + + df = pd.read_csv(fname, sep="\t", header=0, dtype=str) + + if df.empty: + print("\t\t'{}' TSV is empty. 
No data to summarize.\n".format(node)) + + else: + nn_nodes.append(node) + prop_regex = re.compile( + r"^[A-Za-z0-9_]*[^.]$" + ) # drop the links, e.g., cases.submitter_id or diagnoses.id (matches all properties with no ".") + props = list( + filter(prop_regex.match, list(df)) + ) # properties in this TSV to summarize + props = [ + prop for prop in props if prop not in omit_props + ] # omit_props=['project_id','type','id','submitter_id','case_submitter_id','case_ids','visit_id','sample_id','md5sum','file_name','object_id'] + + # msg = "\t\tTotal of {} records in '{}' TSV with {} properties.".format(len(df),node,len(props)) + # sys.stdout.write("\r"+str(msg)) + + for prop in props: # prop=props[0] + + prop_name = "{}.{}".format(node, prop) + prop_id = "{}.{}".format(project_id, prop_name) + print(prop_name) + + # because of sheepdog bug, need to inclue "None" in "null" (:facepalm:) https://ctds-planx.atlassian.net/browse/PXP-5663 + # df.at[df[prop] == "None", prop] = np.nan + + null = df.loc[df[prop].isnull()] + nn = df.loc[df[prop].notnull()] + perc_null = len(null) / len(df) + ptype = self.get_prop_type(node, prop, dd) + + # dict for the prop's row in report dataframe + prop_stats = { + "prop_id": prop_id, + "project_id": project_id, + "node": node, + "property": prop, + "type": ptype, + "N": len(df), + "nn": len(nn), + "null": len(null), + "perc_null": perc_null, + "all_null": np.nan, + "min": np.nan, + "max": np.nan, + "median": np.nan, + "mean": np.nan, + "stdev": np.nan, + "outliers": np.nan, + "bin_number": np.nan, + "bins": np.nan, + } + + if nn.empty: + null_props.append(prop_name) + prop_stats["all_null"] = True + + else: + nn_props.append(prop_name) + all_prop_ids.append(prop_id) + prop_stats["all_null"] = False + + msg = "\t'{}'".format(prop_id) + sys.stdout.write("\r" + str(msg).ljust(200, " ")) + + if ptype in ["string", "enum", "array", "boolean", "date"]: + + if ptype == "array": + + all_bins = list(nn[prop]) + bin_list = [ + bin_txt.split(",") for bin_txt in list(nn[prop]) + ] + counts = Counter( + [ + item + for sublist in bin_list + for item in sublist + ] + ) + + elif ptype in ["string", "enum", "boolean", "date"]: + + counts = Counter(nn[prop]) + + df1 = pd.DataFrame.from_dict( + counts, orient="index" + ).reset_index() + bins = [tuple(x) for x in df1.values] + bins = sorted( + sorted(bins, key=lambda x: (x[0])), + key=lambda x: (x[1]), + reverse=True, + ) # sort first by name, then by value. This way, names with same value are in same order. 
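+ # Added example with hypothetical values: for an enum property, "bins" might look like
+ # [("alive", 120), ("dead", 45)], i.e., (value, count) tuples ordered by count
+ # (descending) and then by value.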
+ + prop_stats["bins"] = bins + prop_stats["bin_number"] = len(bins) + + # Get stats for numbers + elif ptype in ["number", "integer"]: # prop='concentration' + + # make a list of the data values as floats (converted from strings) + nn_all = nn[prop] + d_all = list(nn_all) + + nn_num = ( + nn[prop] + .apply(pd.to_numeric, errors="coerce") + .dropna() + ) + d = list(nn_num) + + nn_string = nn.loc[~nn[prop].isin(list(map(str, d)))] + non_numbers = list(nn_string[prop]) + + if ( + len(d) > 0 + ): # if there are numbers in the data, calculate numeric stats + + # calculate summary stats using the float list d + mean = statistics.mean(d) + median = statistics.median(d) + minimum = min(d) + maximum = max(d) + + if ( + len(d) == 1 + ): # if only one value, no stdev and no outliers + std = "NA" + outliers = [] + else: + std = statistics.stdev(d) + # Get outliers by mean +/- outlier_threshold * stdev + cutoff = ( + std * outlier_threshold + ) # three times the standard deviation is default + lower, upper = ( + mean - cutoff, + mean + cutoff, + ) # cut-offs for outliers is 3 times the stdev below and above the mean + outliers = sorted( + list( + set( + [ + x + for x in d + if x < lower or x > upper + ] + ) + ) + ) + + # if property type is 'integer', change min, max, median to int type + if ptype == "integer": + median = int(median) # median + minimum = int(minimum) # min + maximum = int(maximum) # max + outliers = [ + int(i) for i in outliers + ] # convert outliers from float to int + + prop_stats["stdev"] = std + prop_stats["mean"] = mean + prop_stats["median"] = median + prop_stats["min"] = minimum + prop_stats["max"] = maximum + prop_stats["outliers"] = outliers + + # check if numeric property is mixed with strings, and if so, summarize the string data + if len(d_all) > len(d): + + msg = "\t\tFound {} string values among the {} records of prop '{}' with value(s): {}. Calculating stats only for the {} numeric values.".format( + len(non_numbers), + len(nn), + prop, + list(set(non_numbers)), + len(d), + ) + print("\n\t{}\n".format(msg)) + + prop_stats["type"] = "mixed {},string".format(ptype) + + counts = Counter(nn_string[prop]) + df1 = pd.DataFrame.from_dict( + counts, orient="index" + ).reset_index() + bins = [tuple(x) for x in df1.values] + bins = sorted( + sorted(bins, key=lambda x: (x[0])), + key=lambda x: (x[1]), + reverse=True, + ) + prop_stats["bins"] = bins + prop_stats["bin_number"] = len(bins) + + else: # If its not in the list of ptypes, exit. Need to add array handling. 
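+ # Added note: any property type not covered by the string/enum/array/boolean/date or
+ # number/integer branches above falls through to here and stops the summary.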
+ print( + "\t\t\n\n\n\nUnhandled property type!\n\n '{}': {}\n\n\n\n".format( + prop_id, ptype + ) + ) + exit() + + if bin_limit and isinstance( + prop_stats["bins"], list + ): # if bin_limit != False + prop_stats["bins"] = prop_stats["bins"][: int(bin_limit)] + + # report = report.append(prop_stats, ignore_index=True) + # print("\n{}\n".format(report)) + # print("\n{}\n".format(prop_stats)) + pdf = pd.DataFrame.from_records([prop_stats]) + pdf["all_null"] = pdf["all_null"].astype(bool) + report = pd.concat([report, pdf]) + + if not report_null: # if report_null == False + report = report.loc[report["all_null"] != True] + + # strip the col names so we can sort the report + report.columns = report.columns.str.strip() + report.sort_values(by=["all_null", "node", "property"], inplace=True) + + summary["report"] = report + summary["all_prop_ids"] = all_prop_ids + + # summarize all properties + nn_props = sorted(list(set(nn_props))) + summary["nn_props"] = nn_props + + null_props = [prop for prop in null_props if prop not in nn_props] + summary["null_props"] = sorted(list(set(null_props))) + + # summarize all nodes + nn_nodes = sorted(list(set(nn_nodes))) + summary["nn_nodes"] = nn_nodes + + dd_regex = re.compile(r"[^_][A-Za-z0-9_]+") + dd_nodes = list(filter(dd_regex.match, list(dd))) + dd_nodes = [node for node in dd_nodes if node not in omit_nodes] + null_nodes = [node for node in dd_nodes if node not in nn_nodes] + + summary["null_nodes"] = null_nodes + + if write_report: # write_report == True + + self.create_output_dir(outdir=outdir) + + if "/" in tsv_dir: + names = tsv_dir.split("/") + names = [name for name in names if name != ""] + name = names[-1] + else: + name = tsv_dir + + outname = "data_summary_{}.tsv".format(name) + outname = "{}/{}".format( + outdir, outname + ) # ./data_summary_prod_tsvs_04272020.tsv + + report.to_csv(outname, sep="\t", index=False, encoding="utf-8") + sys.stdout.write("\rReport written to file:".ljust(200, " ")) + print("\n\t{}".format(outname)) + + return summary + + def compare_commons( + self, + reports, + stats=[ + "type", + "all_null", + "N", + "null", + "nn", + "min", + "max", + "mean", + "median", + "stdev", + "bin_number", + "bins", + "outliers", + ], + write_report=True, + outdir=".", + ): + """Takes two data summary reports (output of "self.write_commons_report" func), and compares the data in each. + Comparisons are between matching project/node/property combos (aka "prop_id") in each report. + Args: + reports(dict): a dict of two "commons_name" : "report", where report is a pandas dataframe generated from a summary of TSV data; obtained by running write_summary_report() on the result of summarize_tsv_data(). + stats(list): the list of statistics to compare between data commons for each node/property combination + outdir(str): directory name to save output files to, defaults to the current working dir + write_report(boolean): If True, reports are written to files in the outdir. 
+ + Example: + reports = {"prod": report_0, "prep": report_1} + c = compare_commons(reports) + """ + + dc0, dc1 = list(reports) + r0 = copy.deepcopy(reports[dc0]) + r1 = copy.deepcopy(reports[dc1]) + + r0.insert(loc=0, column="commons", value=dc0) + r1.insert(loc=0, column="commons", value=dc1) + report = pd.concat([r0, r1], ignore_index=True, sort=False) + + cols = list(report) + p0 = list(r0["prop_id"]) + p1 = list(r1["prop_id"]) + prop_ids = sorted(list(set(p0 + p1))) + total = len(prop_ids) + + dcs_stats = [] + for stat in stats: + for dc in list(reports): + dcs_stats.append(dc + "_" + stat) + + common_cols = [col for col in cols if col not in stats + ["commons"]] + comparison_cols = ["comparison"] + common_cols + dcs_stats + comparison = pd.DataFrame(columns=comparison_cols, index=prop_ids) + + prop_count = 1 + + for prop_id in prop_ids: + + msg = "Comparing stats ({} of {}): '{}'".format(prop_count, total, prop_id) + sys.stdout.write("\r" + str(msg).ljust(200, " ")) + + prop_count += 1 + + project_id, node, prop = prop_id.split(".") + + comparison["prop_id"][prop_id] = prop_id + comparison["project_id"][prop_id] = project_id + comparison["node"][prop_id] = node + comparison["property"][prop_id] = prop + + df = report.loc[report["prop_id"] == prop_id].reset_index(drop=True) + + if len(df) == 1: + comparison["comparison"][prop_id] = "unique" + dc = df["commons"][0] + for stat in stats: + col = "{}_{}".format(dc, stat) + comparison[col][prop_id] = df[stat][0] + + elif ( + len(df) == 2 + ): # just a check that there should be 2 rows in the df (comparing prop_id stats between two different commons) + same = [] + for ( + stat + ) in ( + stats + ): # first, check whether any of the stats are different bw commons + + col0 = dc0 + "_" + stat # column name for first commons + col1 = dc1 + "_" + stat # column name for second commons + comparison[col0][prop_id] = df.loc[df["commons"] == dc0].iloc[0][ + stat + ] + comparison[col1][prop_id] = df.loc[df["commons"] == dc1].iloc[0][ + stat + ] + + if ( + df[stat][0] != df[stat][1] + ): # Note: if both values are "NaN" this == True; because NaN != NaN + if ( + list(df[stat].isna())[0] == True + and list(df[stat].isna())[1] == True + ): # if stats are both "NaN", data are identical + same.append(True) + else: # if stats are different AND both values aren't "NaN", data are different + same.append(False) + else: # if stat0 is stat1, data are identical + same.append(True) + + if ( + False in same + ): # if any of the stats are different bw commons, tag as 'different', otherwise tagged as 'identical' + comparison["comparison"][prop_id] = "different" + else: + comparison["comparison"][prop_id] = "identical" + + else: + print( + "\n\nThe number of instances of this prop_id '{}' != 2!\n{}\n\n".format( + prop_id, df + ) + ) + return df + + # check total + identical = comparison.loc[comparison["comparison"] == "identical"] + different = comparison.loc[comparison["comparison"] == "different"] + unique = comparison.loc[comparison["comparison"] == "unique"] + + if len(prop_ids) == len(identical) + len(different) + len(unique): + msg = "All {} prop_ids in the reports were classified as having unique, identical or different data between data commons: {}.\n".format( + len(prop_ids), list(reports) + ) + sys.stdout.write("\r" + str(msg).ljust(200, " ")) + + else: + print("\nSome properties in the report were not classified!") + + # strip the col names so we can sort the report + comparison.columns = comparison.columns.str.strip() + 
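+ # At this point every prop_id row has been tagged "unique", "identical", or "different";
+ # e.g., a prop_id present in only one of the two reports (hypothetical case) would be tagged "unique".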
comparison.sort_values(by=["comparison", "node", "property"], inplace=True) + + if write_report == True: + + self.create_output_dir(outdir) + + outname = "{}/comparison_{}_{}.tsv".format(outdir, dc0, dc1) + comparison.to_csv(outname, sep="\t", index=False, encoding="utf-8") + + msg = "Comparison report written to file: {}".format(outname) + print(msg) + + return comparison + + def get_token(self): + with open(self._auth_provider._refresh_file, "r") as f: + creds = json.load(f) + token_url = "{}/user/credentials/api/access_token".format(self._endpoint) + token = requests.post(token_url, json=creds).json()["access_token"] + return token + + # Guppy funcs + def guppy_query(self, node, props): + + guppy_url = "{}/guppy/graphql".format(self._endpoint) + + query = "{{ {} {{ {} }} }}".format(node, " ".join(props)) + + query_json = {"query": query, "variables": None} + + print("Requesting '{}': {}".format(guppy_url, query_json)) + + response = requests.post(guppy_url, json=query_json, auth=self._auth_provider) + + try: + data = json.loads(response.text) + return data + except: + print("Error querying Guppy") + return response.text + + def guppy_aggregation(self, node, prop, format="JSON"): + + guppy_url = "{}/guppy/graphql".format(self._endpoint) + + query = "{{_aggregation {{{} {{{} {{histogram {{key count}} }}}}}}}}".format( + node, prop + ) + + query_json = {"query": query, "variables": None} + + print("Requesting '{}': {}".format(guppy_url, query_json)) + + response = requests.post(guppy_url, json=query_json, auth=self._auth_provider) + + try: + res = json.loads(response.text) + except: + print("Error querying Guppy") + return response.text + d = res["data"]["_aggregation"][node][prop]["histogram"] + if format == "JSON": + return d + elif format == "TSV": + df = pd.json_normalize(d) + return df + + def guppy_query_simple(self, query_txt, format="JSON"): + + guppy_url = "{}/guppy/graphql".format(self._endpoint) + query_json = {"query": query_txt} + print("Requesting '{}': {}".format(guppy_url, query_json)) + response = requests.post(guppy_url, json=query_json, auth=self._auth_provider) + try: + res = json.loads(response.text) + except: + print("Error querying Guppy") + return response.text + if "data" in response: + d = res["data"]["_aggregation"][node][prop]["histogram"] + if format == "JSON": + return d + elif format == "TSV": + df = pd.json_normalize(d) + return df + else: + print(response.text) + + # Guppy funcs + def guppy_download(self, node, props): + + guppy_dl = "{}guppy/download".format(self._endpoint) + + query_dl = "{{ 'type':'{0}' {{ 'fields': {1} }} }}".format(node, props) + + json_dl = {"query": query_dl, "variables": None} + + print("Requesting '{}': {}".format(guppy_dl, query_dl)) + + headers = {"Authorization": "bearer " + self.get_token()} + + # response = requests.post(guppy_dl, json=json_dl, auth=self._auth_provider) + response = requests.post(guppy_dl, json=query, headers=headers) + + try: + data = json.loads(response.text) + return data + except: + print("Error querying Guppy") + return response.text + + def write_manifest(self, guids, filename="gen3_manifest.json"): + + with open(filename, "w") as mani: + + mani.write("[\n {\n") + + count = 0 + for guid in guids: + count += 1 + file_line = ' "object_id": "{}"\n'.format(guid) + mani.write(file_line) + if count == len(guids): + mani.write(" }]") + else: + mani.write(" },\n {\n") + + print("\tDone ({}/{}).".format(count, len(guids))) + print("\tManifest written to file: {}".format(filename)) + return filename + + def 
list_nodes(
+        self,
+        excluded_schemas=[
+            "_definitions",
+            "_settings",
+            "_terms",
+            "program",
+            "project",
+            "root",
+            "data_release",
+            "metaschema",
+        ],
+    ):
+        """
+        Returns a list of all nodes in the data dictionary, excluding the administrative
+        schemas listed in 'excluded_schemas' (e.g., program, project, _definitions).
+        """
+        dd = self.sub.get_dictionary_all()
+        schemas = list(dd)
+        nodes = [k for k in schemas if k not in excluded_schemas]
+        return nodes
+
+    def query_subject_ids(self, subject_id, nodes=None):
+        """
+        Takes the submitter_id of a case or subject and checks the specified node(s) for records
+        with a matching value in the ubiquitous 'case_ids' or 'subject_ids' property.
+        If no nodes are specified, all nodes in the data dictionary are checked.
+        Returns a dict of node names to DataFrames of the matching exported records.
+        """
+        if nodes is None:
+            nodes = self.list_nodes()
+        elif isinstance(nodes, str):
+            nodes = [nodes]
+
+        if "case" in nodes:
+            subject_node, subject_prop = "case", "case_ids"
+        else:
+            subject_node, subject_prop = "subject", "subject_ids"
+
+        # if projects == None: #if no projects specified, get node for all projects
+        #     projects = list(json_normalize(self.sub.query("""{project (first:0){project_id}}""")['data']['project'])['project_id'])
+        # elif isinstance(projects, str):
+        #     projects = [projects]
+
+        query_args = '{}:"{}"'.format(subject_prop, subject_id)
+        results = {}
+        for node in nodes:
+            res = self.paginate_query(
+                node=node, props=["project_id", "id", "submitter_id"], args=query_args
+            )
+            if len(res["data"][node]) > 0:
+                results[node] = res["data"][node]
+
+        data = {}
+        for node in list(results):
+            # uuids = [rec['id'] for rec in results[node]]
+            dfs = []
+            for rec in results[node]:
+                project_id = rec["project_id"]
+                uuid = rec["id"]
+                program, project = project_id.split("-", 1)
+                rec = self.sub.export_record(
+                    program=program,
+                    project=project,
+                    uuid=uuid,
+                    fileformat="tsv",
+                    filename=None,
+                )
+                # str_list = rec.split('\r\n')
+                # headers = str_list[0].split('\t')
+                # data = str_list[1].split('\t')
+                # df = pd.DataFrame(data,columns=headers)
+                dfs.append(pd.read_csv(StringIO(rec), sep="\t", header=0))
+            df = pd.concat(dfs, ignore_index=True, sort=False)
+            data[node] = df
+
+        return data
+
+    # visits = list(set([item for sublist in [list(set(list(df['visit_id']))) for df in data.values()] for item in sublist if not pd.isnull(item)]))
+
+    def query_visit_ids(self, visit_ids):
+        """
+        This function takes visit submitter_ids and returns the visit records.
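+        Accepts a single visit submitter_id (str) or a list; duplicate IDs are dropped before querying.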
+ You can extract the visit submitter_ids from the data returned from a query_subject_ids function using the following one-liner: + visits = list(set([item for sublist in [list(set(list(df['visit_id']))) for df in data.values()] for item in sublist if not pd.isnull(item)])) + """ + if isinstance(visit_ids, str): + visit_ids = [visit_ids] + if isinstance(visit_ids, list): + visit_ids = list(set(visit_ids)) + else: + print("Please provide one or more visit_ids!") + return + + dfs, visit_uuids = [], [] + for visit_id in visit_ids: + query_args = 'submitter_id:"{}"'.format(visit_id) + res = self.paginate_query( + node="visit", props=["project_id", "id"], args=query_args + ) + if len(res["data"]["visit"]) > 0: + uuid = res["data"]["visit"][0]["id"] + project_id = res["data"]["visit"][0]["project_id"] + program, project = project_id.split("-", 1) + rec = self.sub.export_record( + program=program, + project=project, + uuid=uuid, + fileformat="tsv", + filename=None, + ) + dfs.append(pd.read_csv(StringIO(rec), sep="\t", header=0)) + df = pd.concat(dfs, ignore_index=True, sort=False) + + return df + + def get_mds(self, data=True, limit=1000, args=None, guids=None, save=True): + """ + Gets all the data in the metadata service for a data commons environment. + Set data=False to get only the "guids" of the metadata entries. + """ + + if guids is None: + if args is None: + murl = "{}/mds/metadata?limit={}".format(self._endpoint, limit) + else: + murl = "{}/mds/metadata?limit={}&{}".format(self._endpoint, limit, args) + + if data is True: + murl += "&data=True" + + print("Fetching metadata from URL: \n\t{}".format(murl)) + try: + response = requests.get(murl) + md = json.loads(response.text) + + except Exception as e: + print( + "\tUnable to parse MDS response as JSON!\n\t\t{} {}".format( + type(e), e + ) + ) + md = response.text + + else: + if isinstance(guids, str): + murl = "{}/mds/metadata/{}".format(self._endpoint, guids) + print("Fetching metadata from URL: \n\t{}".format(murl)) + + response = requests.get(murl) + d = json.loads(response.text) + md = {guids: d} + + elif isinstance(guids, list): + md = [] + for guid in guids: + murl = "{}/mds/metadata/{}".format(self._endpoint, guid) + print("Fetching metadata from URL: \n\t{}".format(murl)) + response = requests.get(murl) + md.append(json.loads(response.text)) + + if save == True: + now = datetime.datetime.now() + date = "{}-{}-{}-{}.{}.{}".format( + now.year, now.month, now.day, now.hour, now.minute, now.second + ) + filename = "MDS_{}.json".format(date) + + with open(filename, "w") as fp: + json.dump(md, fp) + + return md + + def get_aggmds(self, data=True, limit=1000, args=None, guids=None, save=False): + """ + Gets all the AggMDS data in the metadata service for a data commons environment. + Set data=False to get only the "guids" of the metadata entries. 
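+        Example:
+            # assuming 'exp' is this Gen3Expansion instance
+            md = exp.get_aggmds(limit=500)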
+ """ + + if guids is None: + if args is None: + murl = "{}/mds/aggregate/metadata?limit={}".format( + self._endpoint, limit + ) + else: + murl = "{}/mds/aggregate/metadata?limit={}&{}".format( + self._endpoint, limit, args + ) + + if data is True: + murl += "&data=True" + + print("Fetching metadata from URL: \n\t{}".format(murl)) + try: + response = requests.get(murl) + md = json.loads(response.text) + + except Exception as e: + print( + "\tUnable to parse MDS response as JSON!\n\t\t{} {}".format( + type(e), e + ) + ) + md = response.text + + else: + if isinstance(guids, str): + murl = "{}/mds/aggregate/metadata/{}".format(self._endpoint, guids) + print("Fetching metadata from URL: \n\t{}".format(murl)) + + response = requests.get(murl) + d = json.loads(response.text) + md = {guids: d} + + elif isinstance(guids, list): + md = [] + for guid in guids: + murl = "{}/mds/aggregate/metadata/{}".format(self._endpoint, guid) + print("Fetching metadata from URL: \n\t{}".format(murl)) + response = requests.get(murl) + md.append(json.loads(response.text)) + + if save == True: + now = datetime.datetime.now() + date = "{}-{}-{}-{}.{}.{}".format( + now.year, now.month, now.day, now.hour, now.minute, now.second + ) + filename = "MDS_{}.json".format(date) + + with open(filename, "w") as fp: + json.dump(md, fp) + + return md + + def get_aggmds_all(self, limit=1000, offset=0, save=True): + """ + Gets all the data in the metadata service for a data commons environment. + """ + m, done = {}, False + while done is False: + murl = "{}/mds/aggregate/metadata?data=True&limit={}&offset={}".format( + self._endpoint, limit, offset + ) + print("Fetching AggMDS records for: {}".format(murl)) + try: + response = requests.get(murl) + mds = json.loads(response.text) + if len(mds) == 0: + done = True + else: + m.update(mds) + + except Exception as e: + print( + "\tUnable to parse AggMDS response as JSON!\n\t\t{} {}".format( + type(e), e + ) + ) + md = response.text + offset += limit + + if save == True: + now = datetime.datetime.now() + date = "{}-{}-{}-{}.{}.{}".format( + now.year, now.month, now.day, now.hour, now.minute, now.second + ) + filename = "AggMDS_{}_{}.json".format(len(m), date) + + with open(filename, "w") as json_file: + json.dump(m, json_file) + print( + "Finished. {} MDS records written to file:\n\t{}".format( + len(m), filename + ) + ) + return m + + def get_mds_all(self, limit=1000, offset=0, save=True): + """ + Gets all the data in the metadata service for a data commons environment. + """ + m, done = {}, False + while done is False: + murl = "{}/mds/metadata?data=True&limit={}&offset={}".format( + self._endpoint, limit, offset + ) + print("Fetching records for: {}".format(murl)) + try: + response = requests.get(murl) + mds = json.loads(response.text) + if len(mds) == 0: + done = True + else: + m.update(mds) + + except Exception as e: + print( + "\tUnable to parse MDS response as JSON!\n\t\t{} {}".format( + type(e), e + ) + ) + md = response.text + offset += limit + + if save == True: + now = datetime.datetime.now() + date = "{}-{}-{}-{}.{}.{}".format( + now.year, now.month, now.day, now.hour, now.minute, now.second + ) + filename = "MDS_{}_{}.json".format(len(m), date) + + with open(filename, "w") as json_file: + json.dump(m, json_file) + print( + "Finished. {} MDS records written to file:\n\t{}".format( + len(m), filename + ) + ) + return m + + def get_mds_all_args(self, args=None, limit=1000, offset=0, save=True): + """ + Gets all the data in the metadata service for a data commons environment. 
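+        Args:
+            args(str): extra query-string parameters appended to the MDS request; which filters are supported depends on the commons' metadata fields.
+        Example:
+            # hypothetical filter value; adjust to the metadata fields in your commons
+            md = exp.get_mds_all_args(args="_guid_type=discovery_metadata")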
+ """ + m, done = {}, False + while done is False: + murl = "{}/mds/metadata?data=True&limit={}&offset={}&{}".format( + self._endpoint, limit, offset, args + ) + print("Fetching records for: {}".format(murl)) + try: + response = requests.get(murl) + mds = json.loads(response.text) + if len(mds) == 0: + done = True + else: + m.update(mds) + + except Exception as e: + print( + "\tUnable to parse MDS response as JSON!\n\t\t{} {}".format( + type(e), e + ) + ) + md = response.text + offset += limit + + if save == True: + now = datetime.datetime.now() + date = "{}-{}-{}-{}.{}.{}".format( + now.year, now.month, now.day, now.hour, now.minute, now.second + ) + filename = "MDS_{}_{}.json".format(len(m), date) + + with open(filename, "w") as json_file: + json.dump(m, json_file) + print( + "Finished. {} MDS records written to file:\n\t{}".format( + len(m), filename + ) + ) + return m + + def delete_mds(self, guids): + """ """ + deleted, failed = [], [] + if isinstance(guids, str): + guids = [guids] + if not isinstance(guids, list): + print("\n\tPlease submit GUIDs as a list.") + + count = 0 + total = len(guids) + for guid in guids: + count += 1 + mds_api = "{}/mds/metadata/{}".format(self._endpoint, guid) + res = requests.delete(mds_api, auth=self._auth_provider) + print(res.text) + + if res.status_code == 200: + deleted.append(guid) + print("({}/{}) Deleted '{}' from MDS.".format(count, total, guid)) + else: + failed.append(guid) + print( + "({}/{}) FAILED to delete '{}' from MDS.".format(count, total, guid) + ) + if len(deleted) > 0: + print("Successfully deleted {} metadata records.".format(len(deleted))) + self.nuked() + return {"deleted": deleted, "failed": failed} + + def submit_mds(self, mds): + """ + Submit metadata to the metadata service (MDS) API. + """ + submitted, failed = [], [] + guids = list(mds) + total = len(guids) + count = 0 + for guid in guids: + count += 1 + print("\n\tPosting '{}' to metadata service".format(guid)) + mds_api = "{}/mds/metadata/{}".format(self._endpoint, guid) + res = requests.post(mds_api, json=mds[guid], auth=self._auth_provider) + + if res.status_code > 199 and res.status_code < 300: + submitted.append(guid) + print("({}/{}) Submitted '{}' to MDS.".format(count, total, guid)) + else: + failed.append(guid) + print( + "({}/{}) FAILED to submit '{}' to MDS.".format(count, total, guid) + ) + print("\n\t\t{}".format(res.text)) + + return {"submitted": submitted, "failed": failed} + + def update_mds(self, mds): + """ + Submit metadata to the metadata service (MDS) API. 
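+        Unlike submit_mds(), which uses POST to create new records, this uses an HTTP PUT to update metadata for GUIDs that already exist; see: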
+ https://petstore.swagger.io/?url=https://raw.githubusercontent.com/uc-cdis/metadata-service/master/docs/openapi.yaml#/Maintain/update_metadata_metadata__guid__put + """ + submitted, failed = [], [] + guids = list(mds) + total = len(guids) + count = 0 + for guid in guids: + count += 1 + print("\n\tPosting '{}' to metadata service".format(guid)) + mds_api = "{}/mds/metadata/{}".format(self._endpoint, guid) + res = requests.put(mds_api, json=mds[guid], auth=self._auth_provider) + + if res.status_code > 199 and res.status_code < 300: + submitted.append(guid) + print("({}/{}) Submitted '{}' to MDS.".format(count, total, guid)) + else: + failed.append(guid) + print( + "({}/{}) FAILED to submit '{}' to MDS.".format(count, total, guid) + ) + print("\n\t\t{}".format(res.text)) + + return {"submitted": submitted, "failed": failed} + + #################################################################################### + ### Functions for MIDRC Pre-ingestion QC of new batches received from data submitters + #################################################################################### + def sort_batch_tsvs(self, batch, batch_dir): + """ + Sorts the TSVs provided by a MIDRC data submitter into manifests and node submission TSVs. + + Args: + batch(str): the name of the batch, e.g., "RSNA_20230303" + batch_dir(str): the full path of the local directory where the batch TSVs are located. + """ + tsvs = [] + for file in os.listdir(batch_dir): + if file.endswith(".tsv"): + tsvs.append(os.path.join(batch_dir, file)) + + nodes = self.get_submission_order() + nodes = [i[0] for i in nodes] + + node_tsvs = {} + clinical_manifests, image_manifests = [], [] + other_tsvs, nomatch_tsvs = [], [] + node_regex = r".*/(\w+)_{}\.tsv".format(batch) + + for tsv in tsvs: + print(tsv) + if "manifest" in tsv: + if "clinical" in tsv: + clinical_manifests.append(tsv) + elif "image" in tsv or "imaging" in tsv: + image_manifests.append(tsv) + else: + match = re.findall(node_regex, tsv, re.M) + print(match) + + if not match: + nomatch_tsvs.append(tsv) + else: + node = match[0] + if node in nodes: + # node_tsvs.append({node:tsv}) + node_tsvs[node] = tsv + elif node + "_file" in nodes: + # node_tsvs.append({"{}_file".format(node):tsv}) + node_tsvs["{}_file".format(node)] = tsv + else: + other_tsvs.append({node: tsv}) + batch_tsvs = { + "batch": batch, + "node_tsvs": node_tsvs, + "image_manifests": image_manifests, + "clinical_manifests": clinical_manifests, + "other_tsvs": other_tsvs, + "nomatch_tsvs": nomatch_tsvs, + } + return batch_tsvs + + def check_case_ids(self, df, node, cids): + """ + Check that all case IDs referenced across dataset are in case TSV; "cids" = "case_ids" + + Args: + df(pandas DataFrame): the DataFrame of a node submission TSV read into pandas + node(str): the name of the node (node ID) being checked + cids(list): the list of case IDs provided in the batch case TSV + """ + errors = [] + extra_cids = [] + if node != "case": + if "case_ids" in df: + df_cids = list(set(df["case_ids"])) + elif "cases.submitter_id" in df: + df_cids = list(set(df["cases.submitter_id"])) + else: + error = "Didn't find any case IDs in the {} TSV!".format(node) + print(error) + errors.append(error) + df_cids = [] + # print("Found {} case IDs in the {} TSV.".format(len(cids),node_id)) + extra_cids = list(set(df_cids).difference(cids)) + + if len(extra_cids) > 0: + error = "{} TSV contains {} case IDs that are not present in the case TSV!\n\t{}\n\n".format( + node, len(extra_cids), extra_cids + ) + print(error) + 
errors.append(error) + + return errors + + def check_type_field(self, df, node): + """ + Check that the type of all values for properties in a node submission TSV match the data dictionary type + + Args: + df(pandas DataFrame): the DataFrame of a node submission TSV read into pandas + node(str): the name of the node (node ID) being checked + """ + errors = [] + if not "type" in df: + error = "{} TSV does not have 'type' header!".format(node) + print(error) + errors.append(error) + else: + if not list(set(df.type))[0] == node: + error = "{} TSV does not have correct 'type' field.".format(node) + print(error) + errors.append(error) + return errors + + def check_submitter_id(self, df, node): + """ + Check that the submitter_id column is complete and doesn't contain duplicates. + "sids" is short for "submitter_ids". + + Args: + df(pandas DataFrame): the DataFrame of a node submission TSV read into pandas + node(str): the name of the node (node ID) being checked + """ + errors = [] + if not "submitter_id" in df: + error = "{} TSV does not have 'submitter_id' header!".format(node) + print(error) + errors.append(error) + else: + sids = list(set(df.submitter_id)) + if not len(sids) == len(df): + error = "{} TSV does not have unique submitter_ids! Submitter_ids: {}, TSV Length: {}".format( + node, len(sids), len(df) + ) + print(error) + errors.append(error) + return errors + + def check_links(self, df, node, dd): + """ + Check whether link headers are provided in a node submission TSV + In many cases, node TSVs simply link to the case node, and submitters just provide the "case_ids" column, but we'll check anyways. + + Args: + df(pandas DataFrame): the DataFrame of a node submission TSV read into pandas + node(str): the name of the node (node ID) being checked + dd(dictionary): the data dictionary being used, get with Gen3Submission.get_dictionary_all() + """ + errors = [] + links = self.list_links(node, dd) + if "core_metadata_collections" in links: + links.remove("core_metadata_collections") + if "core_metadata_collections.submitter_id" in links: + links.remove("core_metadata_collections.submitter_id") + for link in links: + link_col = "{}.submitter_id".format(link) + if link_col not in df: + error = "'{}' link header not found in '{}' TSV.".format(link_col, node) + print( + error + ) # this is not necessarily an error, as some links may be optional, but must have at least 1 link + errors.append(error) + return errors + + # 4) special characters + def check_special_chars( + self, node, batch_tsvs + ): # probably need to add more types of special chars to this + """ + Check for special characters that aren't compatible with Gen3's sheepdog submission service. 
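+        Currently this only flags lines containing the byte 0xe2, the UTF-8 lead byte for common "smart" punctuation such as curly quotes and en/em dashes.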
+ + Args: + node(str): the name of the node (node ID) being checked + """ + errors = [] + filename = batch_tsvs["node_tsvs"][node] + with open(filename, "rb") as tsv_file: + lns = tsv_file.readlines() + count = 0 + for ln in lns: + count += 1 + if b"\xe2" in ln: + error = "{} TSV has special char in line {}: {}".format( + node, count, ln + ) + print(error) + errors.append(error) + return errors + + def check_required_props( + self, + df, + node, + dd, + exclude_props=[ # submitters don't provide these properties, so remove them from QC check + # case props not provided by submitters + "datasets.submitter_id", + "token_record_id", + "linked_external_data", + # series_file props not provided by submitters + "file_name", + "md5sum", + "file_size", + "object_id", + "storage_urls", + "core_metadata_collections.submitter_id", + "core_metadata_collections", + "associated_ids", + # imaging_study props not provided by submitters + "loinc_code", + "loinc_system", + "loinc_contrast", + "loinc_long_common_name", + "loinc_method", + "days_from_study_to_neg_covid_test", + "days_from_study_to_pos_covid_test", + ], + ): + """ + Check whether all required properties for a node are provided in the submission TSV. + + Args: + df(pandas DataFrame): the DataFrame of a node submission TSV read into pandas + node(str): the name of the node (node ID) being checked + dd(dictionary): the data dictionary being used, get with Gen3Submission.get_dictionary_all() + """ + errors = [] + links = self.list_links(node, dd) + any_na = df.columns[df.isna().any()].tolist() + required_props = list( + set(dd[node]["required"]).difference(links).difference(exclude_props) + ) + for prop in required_props: + if prop not in df: + error = "{} TSV does not have required property header '{}'!".format( + node, prop + ) + print(error) + errors.append(error) + elif prop in any_na: + error = "{} TSV does not have complete data for required property '{}'!".format( + node, prop + ) + print(error) + errors.append(error) + return errors + + def check_completeness(self, df, node): + """ + Report on whether any properties in column headers have all NA/null values. + + Args: + df(pandas DataFrame): the DataFrame of a node submission TSV read into pandas + node(str): the name of the node (node ID) being checked + """ + errors = [] + all_na = df.columns[df.isna().all()].tolist() + if len(all_na) > 0: + error = "'{}' TSV has all NA values for these properties: {}".format( + node, all_na + ) + print(error) + errors.append(error) + return errors + + # 7) prop types + def check_prop_types( + self, + df, + node, + dd, + exclude_props=[ # submitters don't provide these properties, so remove them from QC check + # case props not provided by submitters + "datasets.submitter_id", + "token_record_id", + "linked_external_data", + # series_file props not provided by submitters + "file_name", + "md5sum", + "file_size", + "object_id", + "storage_urls", + "core_metadata_collections.submitter_id", + "core_metadata_collections", + "associated_ids", + # imaging_study props not provided by submitters + "loinc_code", + "loinc_system", + "loinc_contrast", + "loinc_long_common_name", + "loinc_method", + "days_from_study_to_neg_covid_test", + "days_from_study_to_pos_covid_test", + ], + ): + """ + Check that the types of properties match their values. 
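+        In addition to type checks, this validates enum values, numeric minimum/maximum bounds, and flags TSV columns that are not in the node's data dictionary definition.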
+ + Args: + df(pandas DataFrame): the DataFrame of a node submission TSV read into pandas + node(str): the name of the node (node ID) being checked + dd(dictionary): the data dictionary being used, get with Gen3Submission.get_dictionary_all() + """ + errors = [] + all_na = df.columns[df.isna().all()].tolist() + links = self.list_links(node, dd) + required_props = list( + set(dd[node]["required"]).difference(links).difference(exclude_props) + ) + if all_na == None: + props = list( + set(dd[node]["properties"]) + .difference(links) + .difference(required_props) + .difference(dd[node]["systemProperties"]) + .difference(exclude_props) + ) + else: + props = list( + set(dd[node]["properties"]) + .difference(links) + .difference(required_props) + .difference(dd[node]["systemProperties"]) + .difference(exclude_props) + .difference(all_na) + ) + for prop in props: + if prop in df: + if "type" in dd[node]["properties"][prop]: + etype = dd[node]["properties"][prop]["type"] # expected type + if etype == "array": + if "items" in dd[node]["properties"][prop]: + etype = dd[node]["properties"][prop]["items"] + if "type" in dd[node]["properties"][prop]["items"]: + etype = dd[node]["properties"][prop]["items"]["type"] + + d = df[prop].dropna() + if etype == "integer": + try: + d = d.astype(int) + except Exception as e: + error = "'{}' prop should be integer, but has non-integer values: {}".format( + prop, e + ) + print(error) + errors.append(error) + elif etype == "number": + try: + d = d.astype(float) + except Exception as e: + error = "'{}' prop should be integer, but has non-integer values: {}".format( + prop, e + ) + print(error) + errors.append(error) + continue # Skip to the next property if conversion fails + if "minimum" in dd[node]["properties"][prop]: + minimum = dd[node]["properties"][prop]["minimum"] + if (d < minimum).any(): + error = ( + "'{}' property has values below the minimum: {}".format( + prop, d[d < minimum] + ) + ) + print(error) + errors.append(error) + if "maximum" in dd[node]["properties"][prop]: + maximum = dd[node]["properties"][prop]["maximum"] + if (d > maximum).any(): + error = ( + "'{}' property has values above the maximum: {}".format( + prop, d[d > maximum] + ) + ) + print(error) + errors.append(error) + elif etype == "boolean": + vals = list(set(d)) + wrong_vals = list( + set(vals).difference( + ["True", "False", "true", "false", "TRUE", "FALSE"] + ) + ) + if len(wrong_vals) > 0: + error = ( + "'{}' property has incorrect boolean values: {}".format( + prop, wrong_vals + ) + ) + print(error) + errors.append(error) + else: + d = d.convert_dtypes( + infer_objects=True, + convert_string=True, + convert_integer=True, + convert_boolean=True, + convert_floating=True, + ) + # itype = d.dtypes[prop] # inferred type + itype = d.dtype # inferred type + # if itype == 'Int64': + # itype = 'integer' + if not etype == itype: + error = "'{}' property has inferred type '{}' and not the expected type: '{}'".format( + prop, itype, etype + ) + print(error) + errors.append(error) + + # to do: Check for min/max of number/int properties + if "minimum" in dd[node]["properties"][prop]: # + min = dd[node]["properties"][prop]["minimum"] + # for each value of d, are any less than min or greater than max + + elif "enum" in dd[node]["properties"][prop]: + enums = dd[node]["properties"][prop]["enum"] + vals = list(set(df[prop].dropna())) + wrong_vals = list(set(vals).difference(enums)) + if len(wrong_vals) > 0: + error = "'{}' property has incorrect enum values: {}".format( + prop, wrong_vals + ) + 
print(error)
+                        errors.append(error)
+
+            else:
+                error = "'{}' property in dictionary is not in the '{}' TSV.".format(
+                    prop, node
+                )
+                print(error)
+                errors.append(error)
+
+        # check that columns in TSV are correctly named and present in data dictionary for that node
+        df_props = list(df)
+        extra_props = list(set(df_props).difference(list(set(dd[node]["properties"]))))
+        for link in links:
+            if link in extra_props:
+                extra_props.remove(link)
+            alt_link = link + ".submitter_id"
+            if alt_link in extra_props:
+                extra_props.remove(alt_link)
+        if len(extra_props) > 0:
+            error = "The {} TSV has properties that are not in the data dictionary: {}".format(
+                node, extra_props
+            )
+            print(error)
+            errors.append(error)
+        errors = list(set(errors))
+        return errors
+
+    def check_dry_submit(self, node, batch_tsvs, project_id):
+        """
+        Attempt a dry-run submission of a node submission TSV.
+
+        Args:
+            node(str): the name of the node (node ID) being checked
+            batch_tsvs(dict): the sorted batch TSVs; output of Gen3Expansion.sort_batch_tsvs()
+            project_id(str): the project_id to submit to, e.g., "DEV-test"
+        """
+        errors = []
+        if node in batch_tsvs["node_tsvs"]:
+            filename = batch_tsvs["node_tsvs"][node]
+            if not filename:
+                print("Couldn't find the {} TSV!".format(node))
+            else:
+                try:
+                    d = self.submit_file_dry(
+                        project_id=project_id, filename=filename, chunk_size=1000
+                    )
+                except Exception as e:
+                    error = "'{}' TSV dry run submission failed: {}".format(node, e)
+                    print(error)
+                    errors.append(error)
+        return errors
+
+    def read_image_manifests(
+        self,
+        image_manifests,
+        cols=[
+            "md5sum",
+            "storage_urls",
+            "file_size",
+            "case_ids",
+            "study_uid",
+            "series_uid",
+            "file_name",
+        ],
+    ):
+        """
+        Reads in and concatenates image manifests if multiple manifests are provided for a batch.
+
+        Args:
+            image_manifests(list): a list of all TSV files matching the format of an image manifest in a batch of TSVs
+            cols(list): the columns required in the image manifest for the packaging script to run properly.
+        """
+        idf = pd.DataFrame(columns=cols)
+        for image_manifest in image_manifests:
+            try:
+                df = pd.read_csv(image_manifest, sep="\t", header=0, dtype=str)
+                df = df[cols]
+                idf = pd.concat([idf, df])
+            except Exception as e:
+                print(
+                    "Couldn't read in the image manifest '{}': {}".format(
+                        image_manifest, e
+                    )
+                )
+        return idf
+
+    def check_image_manifest(
+        self,
+        idf,
+        cids,
+        cols=[
+            "md5sum",
+            "storage_urls",
+            "file_size",
+            "case_ids",
+            "study_uid",
+            "series_uid",
+            "file_name",
+        ],
+    ):
+        """
+        Check for missing required columns in an image manifest.
+
+        Args:
+            idf(DataFrame): the master imaging manifest DataFrame; obtained by running Gen3Expansion.read_image_manifests()
+            cids(list): list of all the case IDs from the case TSV
+            cols(list): the columns required in the image manifest for the packaging script to run properly.
+        """
+        errors = []
+        for col in cols:
+            missing = len(idf[idf[col].isnull()])
+            if missing > 0:
+                error = "{} values missing for image manifest column '{}'.".format(
+                    missing, col
+                )
+                print(error)
+                errors.append(error)
+        if "case_ids" in idf:
+            icids = list(set(idf["case_ids"]))
+            extra_cids = list(set(icids).difference(cids))
+            if len(extra_cids) > 0:
+                error = "The image manifest TSV contains {} case IDs that are not present in the case TSV!".format(
+                    len(extra_cids)
+                )
+                print(error)
+                errors.append(error)
+        else:
+            error = "'case_ids' column missing from image manifest!"
+ print(error) + errors.append(error) + return errors + + def summarize_new_batch( + self, + batch_tsvs, + dd, + outlier_threshold=10, + omit_props=[ + "project_id", + "type", + "id", + "submitter_id", + "case_submitter_id", + "case_ids", + "visit_id", + "sample_id", + "md5sum", + "file_name", + "object_id", + "series_uid", + "study_uid", + "token_record_id", + ], + omit_nodes=["metaschema", "root", "program", "project", "data_release"], + outdir=".", + bin_limit=10, + write_report=True, + report_null=True, + ): + """ + Summarizes a batch of MIDRC submission TSVs. + For each property in each batch submission TSVs, the total, non-null and null counts are returned. + For string, enumeration and boolean properties, bins and the number of unique bins are returned. + For integers and numbers, the mean, median, min, max, and stdev are returned. + Outliers in numeric data are identified using "+/- stdev". The cut-off for outlier identification can be changed by raising or lowering the outlier_threshold (common setting is ~3). + + Args: + batch_tsvs(dict): dictionary of batch TSV names and filenames for a batch; output of "Gen3Expansion.sort_batch_tsvs()" script + dd(dict): data dictionary of the commons result of func Gen3Submission.get_dictionary_all() + outlier_threshold(number): The upper/lower threshold for identifying outliers in numeric data is the standard deviation multiplied by this number. + omit_props(list): Properties to omit from being summarized. It doesn't make sense to summarize certain properties, e.g., those with all unique values. May want to omit: ['sample_id','specimen_number','current_medical_condition_name','medical_condition_name','imaging_results','medication_name']. + omit_nodes(list): Nodes in the data dictionary to omit from being summarized, e.g., program, project, data_release, root and metaschema. + outdir(str): A directory for the output files. + + Examples: + s = summarize_tsvs(batch_tsvs=batch_tsvs, + dd=dd,bin_limit=10) + """ + + summary = {} + + report = pd.DataFrame( + columns=[ + # "prop_id", + # "project_id", + "node", + "property", + "type", + "N", + "nn", + "null", + "perc_null", + "all_null", + "min", + "max", + "median", + "mean", + "stdev", + "outliers", + "bin_number", + "bins", + ] + ) + report["all_null"] = report["all_null"].astype(bool) + + nn_nodes, nn_props, null_nodes, null_props = [], [], [], [] + # all_prop_ids = [] + + for node in batch_tsvs["node_tsvs"]: + filename = batch_tsvs["node_tsvs"][node] + df = pd.read_csv(filename, sep="\t", header=0, dtype=str) + + if df.empty: + print("\t\t'{}' TSV is empty. 
No data to summarize.\n".format(node)) + + else: + nn_nodes.append(node) + prop_regex = re.compile( + r"^[A-Za-z0-9_]*[^.]$" + ) # drop the links, e.g., cases.submitter_id or diagnoses.id (matches all properties with no ".") + props = list( + filter(prop_regex.match, list(df)) + ) # properties in this TSV to summarize + props = [ + prop for prop in props if prop not in omit_props + ] # omit_props=['project_id','type','id','submitter_id','case_submitter_id','case_ids','visit_id','sample_id','md5sum','file_name','object_id'] + + # msg = "\t\tTotal of {} records in '{}' TSV with {} properties.".format(len(df),node,len(props)) + # sys.stdout.write("\r"+str(msg)) + + for prop in props: # prop=props[0] + + prop_name = "{}.{}".format(node, prop) + # prop_id = "{}.{}".format(project_id, prop_name) + print(prop_name) + + # because of sheepdog bug, need to inclue "None" in "null" (:facepalm:) https://ctds-planx.atlassian.net/browse/PXP-5663 + # df.at[df[prop] == "None", prop] = np.nan + + null = df.loc[df[prop].isnull()] + nn = df.loc[df[prop].notnull()] + perc_null = len(null) / len(df) + ptype = self.get_prop_type(node, prop, dd) + + # dict for the prop's row in report dataframe + prop_stats = { + # "prop_id": prop_id, + # "project_id": project_id, + "node": node, + "property": prop, + "type": ptype, + "N": len(df), + "nn": len(nn), + "null": len(null), + "perc_null": perc_null, + "all_null": np.nan, + "min": np.nan, + "max": np.nan, + "median": np.nan, + "mean": np.nan, + "stdev": np.nan, + "outliers": np.nan, + "bin_number": np.nan, + "bins": np.nan, + } + + if nn.empty: + null_props.append(prop_name) + prop_stats["all_null"] = True + + else: + nn_props.append(prop_name) + # all_prop_ids.append(prop_id) + prop_stats["all_null"] = False + + msg = "\t'{}'".format(prop_name) + sys.stdout.write("\r" + str(msg).ljust(200, " ")) + + if ptype in ["string", "enum", "array", "boolean", "date"]: + + if ptype == "array": + + all_bins = list(nn[prop]) + bin_list = [ + bin_txt.split(",") for bin_txt in list(nn[prop]) + ] + counts = Counter( + [item for sublist in bin_list for item in sublist] + ) + + elif ptype in ["string", "enum", "boolean", "date"]: + + counts = Counter(nn[prop]) + + df1 = pd.DataFrame.from_dict( + counts, orient="index" + ).reset_index() + bins = [tuple(x) for x in df1.values] + bins = sorted( + sorted(bins, key=lambda x: (x[0])), + key=lambda x: (x[1]), + reverse=True, + ) # sort first by name, then by value. This way, names with same value are in same order. 
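+                            # net effect: 'bins' is a list of (value, count) tuples ordered by count (descending), ties broken by value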
+ + prop_stats["bins"] = bins + prop_stats["bin_number"] = len(bins) + + # Get stats for numbers + elif ptype in ["number", "integer"]: # prop='concentration' + + # make a list of the data values as floats (converted from strings) + nn_all = nn[prop] + d_all = list(nn_all) + + nn_num = ( + nn[prop].apply(pd.to_numeric, errors="coerce").dropna() + ) + d = list(nn_num) + + nn_string = nn.loc[~nn[prop].isin(list(map(str, d)))] + non_numbers = list(nn_string[prop]) + + if ( + len(d) > 0 + ): # if there are numbers in the data, calculate numeric stats + + # calculate summary stats using the float list d + mean = statistics.mean(d) + median = statistics.median(d) + minimum = min(d) + maximum = max(d) + + if ( + len(d) == 1 + ): # if only one value, no stdev and no outliers + std = "NA" + outliers = [] + else: + std = statistics.stdev(d) + # Get outliers by mean +/- outlier_threshold * stdev + cutoff = ( + std * outlier_threshold + ) # three times the standard deviation is default + lower, upper = ( + mean - cutoff, + mean + cutoff, + ) # cut-offs for outliers is 3 times the stdev below and above the mean + outliers = sorted( + list( + set( + [x for x in d if x < lower or x > upper] + ) + ) + ) + + # if property type is 'integer', change min, max, median to int type + if ptype == "integer": + median = int(median) # median + minimum = int(minimum) # min + maximum = int(maximum) # max + outliers = [ + int(i) for i in outliers + ] # convert outliers from float to int + + prop_stats["stdev"] = std + prop_stats["mean"] = mean + prop_stats["median"] = median + prop_stats["min"] = minimum + prop_stats["max"] = maximum + prop_stats["outliers"] = outliers + + # check if numeric property is mixed with strings, and if so, summarize the string data + if len(d_all) > len(d): + + msg = "\t\tFound {} string values among the {} records of prop '{}' with value(s): {}. Calculating stats only for the {} numeric values.".format( + len(non_numbers), + len(nn), + prop, + list(set(non_numbers)), + len(d), + ) + print("\n\t{}\n".format(msg)) + + prop_stats["type"] = "mixed {},string".format(ptype) + + counts = Counter(nn_string[prop]) + df1 = pd.DataFrame.from_dict( + counts, orient="index" + ).reset_index() + bins = [tuple(x) for x in df1.values] + bins = sorted( + sorted(bins, key=lambda x: (x[0])), + key=lambda x: (x[1]), + reverse=True, + ) + prop_stats["bins"] = bins + prop_stats["bin_number"] = len(bins) + + else: # If its not in the list of ptypes, exit. Need to add array handling. 
+ print( + "\t\t\n\n\n\nUnhandled property type!\n\n '{}': {}\n\n\n\n".format( + prop_name, ptype + ) + ) + exit() + + if bin_limit and isinstance( + prop_stats["bins"], list + ): # if bin_limit != False + prop_stats["bins"] = prop_stats["bins"][: int(bin_limit)] + + # report = report.append(prop_stats, ignore_index=True) + # print("\n{}\n".format(report)) + # print("\n{}\n".format(prop_stats)) + pdf = pd.DataFrame.from_records([prop_stats]) + pdf["all_null"] = pdf["all_null"].astype(bool) + report = pd.concat([report, pdf]) + + if not report_null: # if report_null == False + report = report.loc[report["all_null"] != True] + + # strip the col names so we can sort the report + report.columns = report.columns.str.strip() + report.sort_values(by=["all_null", "node", "property"], inplace=True) + + summary["report"] = report + # + # summary["all_prop_ids"] = all_prop_ids + + # summarize all properties + nn_props = sorted(list(set(nn_props))) + summary["nn_props"] = nn_props + + null_props = [prop for prop in null_props if prop not in nn_props] + summary["null_props"] = sorted(list(set(null_props))) + + # summarize all nodes + nn_nodes = sorted(list(set(nn_nodes))) + summary["nn_nodes"] = nn_nodes + + dd_regex = re.compile(r"[^_][A-Za-z0-9_]+") + dd_nodes = list(filter(dd_regex.match, list(dd))) + dd_nodes = [node for node in dd_nodes if node not in omit_nodes] + null_nodes = [node for node in dd_nodes if node not in nn_nodes] + + summary["null_nodes"] = null_nodes + + if write_report: # write_report == True + + self.create_output_dir(outdir=outdir) + + outname = "data_summary_{}.tsv".format(batch_tsvs["batch"]) + outname = "{}/{}".format( + outdir, outname + ) # ./data_summary_prod_tsvs_04272020.tsv + + report.to_csv(outname, sep="\t", index=False, encoding="utf-8") + sys.stdout.write("\rReport written to file:".ljust(200, " ")) + print("\n\t{}".format(outname)) + + return summary + + # + + def create_mock_files( + self, + project_id="DEV-test", + count=3, + prefix="mock_data_file", + file_format="dcm", + outdir=".", + msg="This is a mock data file for testing purposes. Delete me!", + write_tsv=True, + ): + """ + Create some mock data file objects to use in QA / mock ups. + """ + prog, proj = project_id.split("-") + authz = ["/programs/{}/projects/{}".format(prog, proj)] + acl = [prog, proj] + + mfiles = { + "file_name": [], + "md5sum": [], + "file_size": [], + "object_id": [], + "storage_urls": [], + "acl": [], + "authz": [], + } + for i in range(count): + file_name = "{}_{}.{}".format(prefix, i + 1, file_format) + object_id = str(uuid.uuid4()) + mfiles["file_name"].append(file_name) + mfiles["object_id"].append(object_id) + mfiles["authz"].append(authz) + mfiles["acl"].append(acl) + + output = "{}/{}".format(outdir, file_name) + os.system("touch {}".format(output)) + file_msg = "{} File {} of {}. 
{} with object_id {}.".format( + msg, i + 1, count, file_name, object_id + ) + cmd = 'echo "{}" > {}'.format(file_msg, file_name) + os.system(cmd) + + with open(output, "rb") as file_to_check: + file_contents = file_to_check.read() + # cmd = "!md5 mock_data_file_{}.{}".format(i+1,file_format)) + md5 = hashlib.md5( + file_contents + ).hexdigest() # check in shell: !md5 mock_data_file_3.dcm + + mfiles["md5sum"].append(md5) + mfiles["file_size"].append(os.stat(output).st_size) + urls = "s3://this-is-a-fake-url-for:{}".format(file_name) + mfiles["storage_urls"].append([urls]) + + return mfiles + + def index_mock_files(self, mfiles): + """ + Create indexd records for some fake / mock data files created by create_mock_files() func. + Args: + mfiles = {'file_name':[],'md5sum':[],"file_size":[],"object_id":[],"storage_urls":[],"acl":[],"authz":[]} + """ + results = [] + for i in range(len(mfiles["file_name"])): + print( + "Submitting {} to indexd at {}.".format( + mfiles["file_name"][i], mfiles["object_id"][i] + ) + ) + res = self.create_record( + did=mfiles["object_id"][i], + hashes={"md5": mfiles["md5sum"][i]}, + size=mfiles["file_size"][i], + urls=mfiles["storage_urls"][i], + file_name=mfiles["file_name"][i], + acl=mfiles["acl"][i], + authz=mfiles["authz"][i], + ) + results.append(res) + return results + + def create_mock_tsv( + self, + dd, + node, + count, + parent_tsvs=None, + outdir=".", + filename=None, + links=None, + project_id=None, + excluded_props=[ + "id", + "submitter_id", + "type", + "project_id", + "created_datetime", + "updated_datetime", + "state", + "file_state", + "error_type", + ], + file_props=["file_name", "file_size", "md5sum", "object_id", "storage_urls"], + mfiles=None, + submit_tsv=False, + minimum=1, + maximum=20, + ): + """ + Create mock / simulated data in a submission TSV for a node in the data dictionary. + Args: + dd (dict): the Gen3 data dictionary you get with Gen3Submission.get_dictionary_all() + node(str): the name of the node in the data dictionary + count(int): the number of records / rows to create in the submission TSV + parent_tsvs(dict): a dictionary of node names (keys) and filenames (values) containing the parent node submission TSV; if left blank, the function will not include link submitter_ids; e.g., parent_tsvs = {'cases':'case_mock_1.1.4.tsv'} + outdir(str): the local directory to write simulated TSV data to + filename(str): the filename to use, default is the name of the node + links(list): a list of links to include in the submission TSV + excluded_props(list): a list of properties in data dictionary to ignore / exclude from the TSV columns + mfiles(dict): a dictionary of mock data files created using func create_mock_files() + submit_tsv(boolean): if true, will use sdk to submit the file via sheepdog + + ############################################################ + ############################################################ + # Use these settings for testing, comment out when actually running as function or in SDK. 
+ ############################################################ + dd = sub.get_dictionary_all() + dd_version = dd["_settings"]["_dict_version"] + node = 'cr_series_file' + count = 3 + outdir = "/Users/christopher/Documents/Notes/MIDRC/annotations/sample_data/DEV-test/script_tsvs" + filename = "{}_mock_{}.tsv".format(node,dd_version) # override for testing, comment out + links = self.list_links(node, dd) + + parent_tsvs = {link:"{}/{}_mock_{}.tsv".format(outdir,link_targets[link],dd_version) for link in links} + links = None # for testing comment this out later + ############################################################ + ############################################################ + ############################################################ + """ + # get the data dictionary and version number + dd_version = dd["_settings"]["_dict_version"] + + data = {} + data["type"] = [node] * count + data["submitter_id"] = ["{}-{}".format(node, i + 1) for i in range(count)] + + props = list(dd[node]["properties"]) + props = list(set(props).difference(excluded_props)) + + # build list of link_names to filter excluded nodes out of links + if links is None: # if user didn't specify the links to be used, use them all. + links = self.list_links(node, dd) + if "subgroup" in dd[node]["links"][0]: + link_targets = { + i["name"]: i["target_type"] + for i in dd[node]["links"][0]["subgroup"] + } + else: + link_targets = { + i["name"]: i["target_type"] for i in dd[node]["links"] + } # get targets to filter out excluded nodes + + link_names = {} + for link in links: + ( + props.remove(link) if link in props else False + ) # remove the links bc missing ".submitter_id", will add back below + if link == "projects": + link_name = "projects.code" + else: + target_type = link_targets[link] + link_name = "{}.submitter_id".format(link) + link_names[link] = link_name + + # add links to data + for link in link_names: + link_name = link_names[link] + if link_name == "projects.code" and project_id is not None: + prog, proj = project_id.split("-", 1) + data[link_name] = [proj] * count + elif parent_tsvs is None: # + data[link_name] = [np.nan] * count + else: + parent_tsv = parent_tsvs[link] + pdf = pd.read_csv(parent_tsv, sep="\t", header=0) + psids = list(set(pdf["submitter_id"])) + available_psids = cycle(psids) + data[link_name] = [next(available_psids) for i in range(count)] + + if mfiles is not None: + props = list(set(props).difference(file_props)) + if len(mfiles["file_name"]) != count: + print( + "The number of mock data files provided in 'mfiles' ({}) does not match the 'count' provided ({})!".format( + len(mfiles["file_name"]), count + ) + ) + for file_prop in file_props: + if file_prop in mfiles: + data[file_prop] = mfiles[file_prop] + else: + print( + "File property '{}' is missing from mfiles! 
\n\t{}".format( + file_prop, list(mfiles) + ) + ) + + for prop in props: + if prop == "file_name": + data["file_name"] = [ + sid + ".mock_filename.txt" for sid in data["submitter_id"] + ] + elif prop == "md5sum": + md5s = [] + for i in range(count): + md5 = str(hashlib.md5(b"test").hexdigest()) + md5s.append(md5) + data["md5sum"] = md5s + elif prop == "object_id": + # add blank column to fill later upon submission (will create indexd records to get object_ids) + data["object_id"] = [np.nan] * count + # object_ids = [] + # for i in range(count): + # irec = index.create_blank(uploader="cgmeyer@uchicago.edu",file_name="thisisatest.filename") + # object_ids.append(irec['did']) + # OR + # object_ids.append(str(uuid.uuid4())) # guids will need to be created in indexd later for sheepdog submission to work + # data['object_id'] = object_ids + elif "type" in dd[node]["properties"][prop]: + prop_type = dd[node]["properties"][prop]["type"] # expected type + if prop_type == "array": + if "items" in dd[node]["properties"][prop]: + array_type = dd[node]["properties"][prop]["items"] + if "type" in dd[node]["properties"][prop]["items"]: + array_type = dd[node]["properties"][prop]["items"]["type"] + if "minimum" in dd[node]["properties"][prop]["items"]: + minimum = dd[node]["properties"][prop]["items"]["minimum"] + if "maximum" in dd[node]["properties"][prop]["items"]: + maximum = dd[node]["properties"][prop]["items"]["maximum"] + if array_type == "string": + data[prop] = ["test {}, test {}".format(prop, prop)] * count + # array_values = ["test {}, test {}".format(prop,prop)] * count + # data[prop] = ','.join(array_values) + elif array_type == "integer": + array_list = [] + for i in range(count): + array_list.append( + ",".join( + map( + str, + list( + np.random.randint(low=1, high=89, size=(2)) + ), + ) + ) + ) + data[prop] = array_list + elif array_type == "number": + array_list = [] + for i in range(count): + one_array = list( + np.random.uniform(low=1, high=89, size=(2)) + ) + formatted_array = ["%.2f" % elem for elem in one_array] + array_list.append(",".join(map(str, formatted_array))) + data[prop] = array_list + elif array_type == "enum": + print("do something") + elif prop_type == "string": + data[prop] = ["test " + prop] * count + elif prop_type == "boolean": + available_types = cycle([True, False]) + data[prop] = [next(available_types) for i in range(count)] + elif prop_type == "integer": + if "minimum" in dd[node]["properties"][prop]: + minimum = dd[node]["properties"][prop]["minimum"] + if "maximum" in dd[node]["properties"][prop]: + maximum = dd[node]["properties"][prop]["maximum"] + data[prop] = list( + np.random.randint(low=minimum, high=maximum, size=(count)) + ) + elif prop_type == "number": + if "minimum" in dd[node]["properties"][prop]: + minimum = dd[node]["properties"][prop]["minimum"] + if "maximum" in dd[node]["properties"][prop]: + maximum = dd[node]["properties"][prop]["maximum"] + data[prop] = [ + "%.2f" % elem + for elem in list( + np.random.uniform(low=minimum, high=maximum, size=count) + ) + ] + + elif "enum" in dd[node]["properties"][prop]: + enums = dd[node]["properties"][prop]["enum"] + available_enums = cycle(enums) + # enum_values = ['a','b'] + # available_enums = cycle(enum_values) + data[prop] = [next(available_enums) for i in range(count)] + + # create a dataframe and save as a TSV + df = pd.DataFrame(data) + + # save dataframe to TSV file + if filename is None: + filename = "{}_mock_{}.tsv".format(node, dd_version) + + Path(outdir).mkdir(parents=True, exist_ok=True) + 
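+        # write the mock data to "<outdir>/<filename>"; if submit_tsv is True, the TSV is also submitted via sheepdog below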
+ output = "{}/{}".format(outdir, filename) + df.to_csv(output, sep="\t", index=False) + + if submit_tsv is True: + filename = "{}_mock_{}.tsv".format(node, dd_version) + output = "{}/{}".format(outdir, filename) + self.submit_file(project_id="DEV-test", filename=output) + return df + + def create_mock_project( + self, + dd, + node_counts=None, + project_id=None, + outdir="mock_tsvs", + excluded_props=[ + "id", + "submitter_id", + "type", + "project_id", + "created_datetime", + "updated_datetime", + "state", + "file_state", + "error_type", + ], + file_props=["file_name", "file_size", "md5sum", "object_id", "storage_urls"], + excluded_nodes=[], + submit_tsvs=False, + ): + """ + + Create mock / simulated data project for a list of nodes in the data dictionary. Ignores program/project root nodes, so make sure those exist first. This is a wrapper for the func Gen3Expansion.create_mock_tsv() + Args: + dd (dict): the Gen3 data dictionary you get with Gen3Submission.get_dictionary_all(). + node_counts(dict): node_ids as keys, values is number of records to create for that node. + For example: {"case":3,"imaging_study":6} + project_id(str): If no project_id is provided, using the generic 'DEV-test' project_id + outdir(str): the local directory to write simulated TSV data to. + excluded_props(list): a list of properties in data dictionary to ignore / exclude from TSVs. + file_props(list): a list of file_properties to be simulated; unlikely to change from default. + excluded_nodes(list): a list of nodes to not create mock TSVs for. + submit_tsvs(boolean): if true, will use sdk to submit the DataFrames via sheepdog + """ + dd_version = dd["_settings"]["_dict_version"] + if project_id is None: + print( + "\tNo 'project_id' provided; using the generic 'DEV-test' as the project_id." + ) + project_id = "DEV-test" + prog, proj = project_id.split("-", 1) + + # for the create_mock_tsv() func, we need "node", "count" and "parent_tsvs". 
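+        # A hypothetical example call: create_mock_project(dd=dd, node_counts={"case": 3, "imaging_study": 6}, project_id="DEV-test")
+        # writes one mock TSV per node into 'outdir'; with submit_tsvs=True it also creates indexd records for file nodes and submits via sheepdog.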
+ + # Build node_counts if not provided; this gets us "node" and "count" + node_counts = None + if node_counts is None: + node_order = self.get_submission_order() + node_counts = {} + for node in node_order: + node_id = node[0] + node_count = node[1] + print(node_id) + if ( + node_id == "project" or node_id in excluded_nodes + ): # skip project node + continue + else: + node_counts[node_id] = ( + node_count * node_count + ) # get progressively larger counts as you go down in data model hierarchy + print( + "\tNo node_counts provided; using the following node_counts:\n\t{}".format( + node_counts + ) + ) + + # Now build "parent_tsvs" for each node in "node_counts": + all_parent_tsvs = {} + for node in node_counts: + print(node) + parent_tsvs = {} + node_links = dd[node]["links"][0] + if "subgroup" in node_links: + sublinks = node_links["subgroup"] + link_targets = { + i["name"]: i["target_type"] + for i in sublinks + if i["target_type"] not in excluded_nodes + } + if ( + node_links["exclusive"] == True + ): # check if subgroup links are exclusive + random_link = random.choice( + list(link_targets.items()) + ) # pick only one random link if exclusive + link_targets = {random_link[0]: random_link[1]} + else: + link_targets = { + i["name"]: i["target_type"] + for i in dd[node]["links"] + if i["target_type"] not in excluded_nodes + } # get targets to filter out excluded nodes + + for link in link_targets: + parent_tsvs[link] = "{}/{}_mock_{}.tsv".format( + outdir, link_targets[link], dd_version + ) + # print("\t\t{}".format(parent_tsvs)) + all_parent_tsvs[node] = parent_tsvs + + # Create the TSVs + for node in node_counts: + # Create the node TSV / DataFrame + df = self.create_mock_tsv( + dd=dd, + node=node, + count=node_counts[node], + parent_tsvs=all_parent_tsvs[node], + project_id=project_id, + outdir=outdir, + ) + if submit_tsvs: + if "object_id" in df and df["object_id"].isnull().values.any(): + object_ids = [] + for i in range(len(df)): + file_name = list(df["file_name"])[i] + size = list(df["file_size"])[i] + md5 = list(df["md5sum"])[i] + try: + irec = self.create_mock_indexd_record( + file_name=file_name, + md5=md5, + size=size, + project_id=project_id, + ) + except Exception as e: + print( + "Couldn't create the indexd record for file {}:\n\t{}".format( + file_name, e + ) + ) + object_id = irec["did"] + object_ids.append(object_id) + df["object_id"] = object_ids + d = self.submit_df(project_id=project_id, df=df, chunk_size=250) + + def create_mock_indexd_record( + self, + file_name, + md5, + size, + project_id="DEV-test", + uploader="cgmeyer@uchicago.edu", + ): + """ + Create a blank indexd record} + """ + prog, proj = project_id.split("-", 1) + iurl = "{}/index/index".format(self._endpoint) + payload = { + "form": "object", + "file_name": file_name, + "hashes": {"md5": md5}, + "size": size, + "authz": ["/programs/{}/projects/{}".format(prog, proj)], + "acl": [prog, proj], + "urls": ["s3://mock/bucket/{}".format(file_name)], + #'uploader':uploader + } + try: + res = requests.post( + iurl, + headers={"content-type": "application/json"}, + auth=self._auth_provider, + data=json.dumps(payload), + ) + except Exception as e: + print( + "\n\tError creating indexd record:\n{}\n{}\n{}\n".format( + res, res.text, e + ) + ) + data = res.json() + return data + + def create_blank_indexd_record( + self, uploader="cgmeyer@uchicago.edu", file_name=None + ): + """ + Create a blank indexd record} + """ + iurl = "{}/index/index/blank".format(self._endpoint) + payload = {"uploader": uploader, "file_name": 
file_name} + res = requests.post( + iurl, + headers={"content-type": "application/json"}, + auth=self._auth_provider, + data=json.dumps(payload), + ) + try: + data = res.json() + return data + except Exception as e: + print( + "\n\tNo json in indexd response:\n{}\n{}\n{}\n".format(res, res.text, e) + ) + return res.text + + def nuked(self, message="Deleted!"): + mushroom_cloud1 = """ + _.-^^---....,,-- + _-- --_ + < >) + """ + mushroom_cloud2 = """ + | | + \._ _./ + ```--. . , ; .--''' + | | | + .-=|| | |=-. + `-=#$%&%$#=-' + | ; :| + _____.,-#%&$@%#&#~,._____ + """ + print(mushroom_cloud1) + print("\t\t{}".format(message)) + print(mushroom_cloud2) diff --git a/stress_testing/sd_stress.ipynb b/stress_testing/sd_stress.ipynb new file mode 100644 index 00000000..6f0fd8d9 --- /dev/null +++ b/stress_testing/sd_stress.ipynb @@ -0,0 +1,287 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "#Install required libraries\n", + "import pandas as pd\n", + "from gen3.submission import Gen3Submission\n", + "from gen3.auth import Gen3Auth\n", + "from gen3.query import Gen3Query" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "--2025-02-26 18:17:44-- https://raw.githubusercontent.com/cgmeyer/gen3sdk-python/master/expansion/expansion.py\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 235477 (230K) [text/plain]\n", + "Saving to: ‘expansion.py.1’\n", + "\n", + " 0K .......... .......... .......... .......... .......... 21% 12.1M 0s\n", + " 50K .......... .......... .......... .......... .......... 43% 11.0M 0s\n", + " 100K .......... .......... .......... .......... .......... 65% 18.7M 0s\n", + " 150K .......... .......... .......... .......... .......... 86% 21.5M 0s\n", + " 200K .......... .......... ......... 100% 12.2M=0.02s\n", + "\n", + "2025-02-26 18:17:45 (14.3 MB/s) - ‘expansion.py.1’ saved [235477/235477]\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "### download and import some custom Python scripts from https://github.com/cgmeyer/gen3sdk-python\n", + "# import os\n", + "#os.system(\"wget https://raw.githubusercontent.com/cgmeyer/gen3sdk-python/master/expansion/expansion.py\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "from expansion import Gen3Expansion" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Getting all project_ids you have access to in https://qa-midrc.planx-pla.net/\n" + ] + }, + { + "ename": "Gen3AuthError", + "evalue": "Failed to get an access token from https://qa-midrc.planx-pla.net/user/credentials/cdis/access_token:\n\r\n502 Bad Gateway\r\n\r\n
502 Bad Gateway
\r\n\r\n\r\n", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mGen3AuthError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[43], line 9\u001b[0m\n\u001b[1;32m 7\u001b[0m query \u001b[38;5;241m=\u001b[39m Gen3Query(auth) \u001b[38;5;66;03m# query class\u001b[39;00m\n\u001b[1;32m 8\u001b[0m exp \u001b[38;5;241m=\u001b[39m Gen3Expansion(api,auth,sub) \u001b[38;5;66;03m# class with some custom scripts\u001b[39;00m\n\u001b[0;32m----> 9\u001b[0m \u001b[43mexp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_project_ids\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Documents/git_projects_m1/sheepdog/stress_testing/expansion.py:217\u001b[0m, in \u001b[0;36mGen3Expansion.get_project_ids\u001b[0;34m(self, node, name)\u001b[0m\n\u001b[1;32m 215\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(queries) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 216\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m query \u001b[38;5;129;01min\u001b[39;00m queries:\n\u001b[0;32m--> 217\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msub\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mquery\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 218\u001b[0m df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mjson_normalize(res[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdata\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mproject\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m 219\u001b[0m project_ids \u001b[38;5;241m=\u001b[39m project_ids \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mlist\u001b[39m(\u001b[38;5;28mset\u001b[39m(df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mproject_id\u001b[39m\u001b[38;5;124m\"\u001b[39m]))\n", + "File \u001b[0;32m~/.pyenv/versions/3.9.13/envs/sheepdog_3913/lib/python3.9/site-packages/gen3/submission.py:425\u001b[0m, in \u001b[0;36mGen3Submission.query\u001b[0;34m(self, query_txt, variables, max_tries)\u001b[0m\n\u001b[1;32m 423\u001b[0m tries \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m 424\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m tries \u001b[38;5;241m<\u001b[39m max_tries:\n\u001b[0;32m--> 425\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[43mrequests\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpost\u001b[49m\u001b[43m(\u001b[49m\u001b[43mapi_url\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mauth\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_auth_provider\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mjson\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mquery\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mtext\n\u001b[1;32m 426\u001b[0m data \u001b[38;5;241m=\u001b[39m json\u001b[38;5;241m.\u001b[39mloads(output)\n\u001b[1;32m 428\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124merrors\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m data:\n", + "File \u001b[0;32m~/.pyenv/versions/3.9.13/envs/sheepdog_3913/lib/python3.9/site-packages/requests/api.py:115\u001b[0m, in \u001b[0;36mpost\u001b[0;34m(url, data, json, **kwargs)\u001b[0m\n\u001b[1;32m 103\u001b[0m 
\u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mpost\u001b[39m(url, data\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, json\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 104\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"Sends a POST request.\u001b[39;00m\n\u001b[1;32m 105\u001b[0m \n\u001b[1;32m 106\u001b[0m \u001b[38;5;124;03m :param url: URL for the new :class:`Request` object.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;124;03m :rtype: requests.Response\u001b[39;00m\n\u001b[1;32m 113\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 115\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mpost\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mjson\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mjson\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.pyenv/versions/3.9.13/envs/sheepdog_3913/lib/python3.9/site-packages/requests/api.py:59\u001b[0m, in \u001b[0;36mrequest\u001b[0;34m(method, url, **kwargs)\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;66;03m# By using the 'with' statement we are sure the session is closed, thus we\u001b[39;00m\n\u001b[1;32m 56\u001b[0m \u001b[38;5;66;03m# avoid leaving sockets open which can trigger a ResourceWarning in some\u001b[39;00m\n\u001b[1;32m 57\u001b[0m \u001b[38;5;66;03m# cases, and look like a memory leak in others.\u001b[39;00m\n\u001b[1;32m 58\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m sessions\u001b[38;5;241m.\u001b[39mSession() \u001b[38;5;28;01mas\u001b[39;00m session:\n\u001b[0;32m---> 59\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43msession\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmethod\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.pyenv/versions/3.9.13/envs/sheepdog_3913/lib/python3.9/site-packages/requests/sessions.py:575\u001b[0m, in \u001b[0;36mSession.request\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m 562\u001b[0m \u001b[38;5;66;03m# Create the Request.\u001b[39;00m\n\u001b[1;32m 563\u001b[0m req \u001b[38;5;241m=\u001b[39m Request(\n\u001b[1;32m 564\u001b[0m method\u001b[38;5;241m=\u001b[39mmethod\u001b[38;5;241m.\u001b[39mupper(),\n\u001b[1;32m 565\u001b[0m url\u001b[38;5;241m=\u001b[39murl,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 573\u001b[0m hooks\u001b[38;5;241m=\u001b[39mhooks,\n\u001b[1;32m 574\u001b[0m )\n\u001b[0;32m--> 575\u001b[0m prep \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprepare_request\u001b[49m\u001b[43m(\u001b[49m\u001b[43mreq\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 577\u001b[0m proxies \u001b[38;5;241m=\u001b[39m proxies \u001b[38;5;129;01mor\u001b[39;00m {}\n\u001b[1;32m 579\u001b[0m settings \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmerge_environment_settings(\n\u001b[1;32m 580\u001b[0m prep\u001b[38;5;241m.\u001b[39murl, proxies, stream, verify, cert\n\u001b[1;32m 581\u001b[0m )\n", + "File \u001b[0;32m~/.pyenv/versions/3.9.13/envs/sheepdog_3913/lib/python3.9/site-packages/requests/sessions.py:484\u001b[0m, in \u001b[0;36mSession.prepare_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 481\u001b[0m auth \u001b[38;5;241m=\u001b[39m get_netrc_auth(request\u001b[38;5;241m.\u001b[39murl)\n\u001b[1;32m 483\u001b[0m p \u001b[38;5;241m=\u001b[39m PreparedRequest()\n\u001b[0;32m--> 484\u001b[0m \u001b[43mp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprepare\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 485\u001b[0m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmethod\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mupper\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 486\u001b[0m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 487\u001b[0m \u001b[43m \u001b[49m\u001b[43mfiles\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfiles\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 488\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 489\u001b[0m \u001b[43m \u001b[49m\u001b[43mjson\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjson\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmerge_setting\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 491\u001b[0m \u001b[43m \u001b[49m\u001b[43mrequest\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdict_class\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mCaseInsensitiveDict\u001b[49m\n\u001b[1;32m 492\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 493\u001b[0m \u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmerge_setting\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequest\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparams\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparams\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 494\u001b[0m \u001b[43m \u001b[49m\u001b[43mauth\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmerge_setting\u001b[49m\u001b[43m(\u001b[49m\u001b[43mauth\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mauth\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 495\u001b[0m \u001b[43m \u001b[49m\u001b[43mcookies\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmerged_cookies\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 496\u001b[0m \u001b[43m \u001b[49m\u001b[43mhooks\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmerge_hooks\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequest\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mhooks\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mhooks\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 497\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 498\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m p\n", + "File \u001b[0;32m~/.pyenv/versions/3.9.13/envs/sheepdog_3913/lib/python3.9/site-packages/requests/models.py:371\u001b[0m, in \u001b[0;36mPreparedRequest.prepare\u001b[0;34m(self, method, url, headers, files, data, params, auth, cookies, hooks, json)\u001b[0m\n\u001b[1;32m 369\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprepare_cookies(cookies)\n\u001b[1;32m 370\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprepare_body(data, files, json)\n\u001b[0;32m--> 371\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprepare_auth\u001b[49m\u001b[43m(\u001b[49m\u001b[43mauth\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 373\u001b[0m \u001b[38;5;66;03m# Note that prepare_auth must be last to enable authentication schemes\u001b[39;00m\n\u001b[1;32m 374\u001b[0m \u001b[38;5;66;03m# such as OAuth to work on a fully prepared request.\u001b[39;00m\n\u001b[1;32m 375\u001b[0m \n\u001b[1;32m 376\u001b[0m \u001b[38;5;66;03m# This MUST go after prepare_auth. 
Authenticators could add a hook\u001b[39;00m\n\u001b[1;32m 377\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprepare_hooks(hooks)\n", + "File \u001b[0;32m~/.pyenv/versions/3.9.13/envs/sheepdog_3913/lib/python3.9/site-packages/requests/models.py:602\u001b[0m, in \u001b[0;36mPreparedRequest.prepare_auth\u001b[0;34m(self, auth, url)\u001b[0m\n\u001b[1;32m 599\u001b[0m auth \u001b[38;5;241m=\u001b[39m HTTPBasicAuth(\u001b[38;5;241m*\u001b[39mauth)\n\u001b[1;32m 601\u001b[0m \u001b[38;5;66;03m# Allow auth to make its changes.\u001b[39;00m\n\u001b[0;32m--> 602\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43mauth\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 604\u001b[0m \u001b[38;5;66;03m# Update self to reflect the auth changes.\u001b[39;00m\n\u001b[1;32m 605\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__dict__\u001b[39m\u001b[38;5;241m.\u001b[39mupdate(r\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__dict__\u001b[39m)\n", + "File \u001b[0;32m~/.pyenv/versions/3.9.13/envs/sheepdog_3913/lib/python3.9/site-packages/gen3/auth.py:330\u001b[0m, in \u001b[0;36mGen3Auth.__call__\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 320\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, request):\n\u001b[1;32m 321\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Adds authorization header to the request\u001b[39;00m\n\u001b[1;32m 322\u001b[0m \n\u001b[1;32m 323\u001b[0m \u001b[38;5;124;03m This gets called by the python.requests package on outbound requests\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 328\u001b[0m \n\u001b[1;32m 329\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 330\u001b[0m request\u001b[38;5;241m.\u001b[39mheaders[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAuthorization\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_auth_value\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 331\u001b[0m request\u001b[38;5;241m.\u001b[39mregister_hook(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mresponse\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_handle_401)\n\u001b[1;32m 332\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m request\n", + "File \u001b[0;32m~/.pyenv/versions/3.9.13/envs/sheepdog_3913/lib/python3.9/site-packages/gen3/auth.py:456\u001b[0m, in \u001b[0;36mGen3Auth._get_auth_value\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 449\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m_get_auth_value\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 450\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Returns the Authorization header value for the request\u001b[39;00m\n\u001b[1;32m 451\u001b[0m \n\u001b[1;32m 452\u001b[0m \u001b[38;5;124;03m This gets called when added the Authorization header to the request.\u001b[39;00m\n\u001b[1;32m 453\u001b[0m \u001b[38;5;124;03m This fetches the access token from the refresh token if the access token is missing.\u001b[39;00m\n\u001b[1;32m 454\u001b[0m \n\u001b[1;32m 455\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 456\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbearer \u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_access_token\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.pyenv/versions/3.9.13/envs/sheepdog_3913/lib/python3.9/site-packages/gen3/auth.py:443\u001b[0m, in \u001b[0;36mGen3Auth.get_access_token\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 437\u001b[0m need_new_token \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 438\u001b[0m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_access_token\n\u001b[1;32m 439\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_access_token_info\n\u001b[1;32m 440\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m300\u001b[39m \u001b[38;5;241m>\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_access_token_info[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mexp\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 441\u001b[0m )\n\u001b[1;32m 442\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m need_new_token:\n\u001b[0;32m--> 443\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrefresh_access_token\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 444\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mendpoint\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mhasattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mendpoint\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\n\u001b[1;32m 445\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 446\u001b[0m \u001b[38;5;66;03m# use cache\u001b[39;00m\n\u001b[1;32m 447\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_access_token\n", + "File \u001b[0;32m~/.pyenv/versions/3.9.13/envs/sheepdog_3913/lib/python3.9/site-packages/gen3/auth.py:372\u001b[0m, in \u001b[0;36mGen3Auth.refresh_access_token\u001b[0;34m(self, endpoint)\u001b[0m\n\u001b[1;32m 368\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_access_token \u001b[38;5;241m=\u001b[39m get_access_token_with_client_credentials(\n\u001b[1;32m 369\u001b[0m endpoint, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_client_credentials, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_client_scopes\n\u001b[1;32m 370\u001b[0m )\n\u001b[1;32m 371\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_refresh_token:\n\u001b[0;32m--> 372\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_access_token \u001b[38;5;241m=\u001b[39m \u001b[43mget_access_token_with_key\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_refresh_token\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 373\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 374\u001b[0m logging\u001b[38;5;241m.\u001b[39mwarning(\n\u001b[1;32m 375\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnable to refresh access 
token. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 376\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAuthorized API calls will stop working when this token expires.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 377\u001b[0m )\n", + "File \u001b[0;32m~/.pyenv/versions/3.9.13/envs/sheepdog_3913/lib/python3.9/site-packages/gen3/auth.py:77\u001b[0m, in \u001b[0;36mget_access_token_with_key\u001b[0;34m(api_key)\u001b[0m\n\u001b[1;32m 75\u001b[0m resp \u001b[38;5;241m=\u001b[39m requests\u001b[38;5;241m.\u001b[39mpost(auth_url, json\u001b[38;5;241m=\u001b[39mapi_key)\n\u001b[1;32m 76\u001b[0m token_key \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maccess_token\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 77\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_handle_access_token_response\u001b[49m\u001b[43m(\u001b[49m\u001b[43mresp\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtoken_key\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.pyenv/versions/3.9.13/envs/sheepdog_3913/lib/python3.9/site-packages/gen3/auth.py:58\u001b[0m, in \u001b[0;36m_handle_access_token_response\u001b[0;34m(resp, token_key)\u001b[0m\n\u001b[1;32m 56\u001b[0m err_msg \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFailed to get an access token from \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 57\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m resp\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m200\u001b[39m:\n\u001b[0;32m---> 58\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m Gen3AuthError(err_msg\u001b[38;5;241m.\u001b[39mformat(resp\u001b[38;5;241m.\u001b[39murl, resp\u001b[38;5;241m.\u001b[39mtext))\n\u001b[1;32m 59\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 60\u001b[0m json_resp \u001b[38;5;241m=\u001b[39m resp\u001b[38;5;241m.\u001b[39mjson()\n", + "\u001b[0;31mGen3AuthError\u001b[0m: Failed to get an access token from https://qa-midrc.planx-pla.net/user/credentials/cdis/access_token:\n\r\n502 Bad Gateway\r\n\r\n
502 Bad Gateway
\r\n\r\n\r\n" + ] + } + ], + "source": [ + "# Initiate instances of the Gen3 SDK Classes using credentials file downloaded from https://staging.midrc.org/identity\n", + "# You can view the SDK code/functions in GitHub: https://github.com/uc-cdis/gen3sdk-python\n", + "api = 'https://qa-midrc.planx-pla.net/'\n", + "cred = 'midrc_api_key.json'\n", + "auth = Gen3Auth(api, refresh_file=cred) # authentication class\n", + "sub = Gen3Submission(api, auth) # submission class\n", + "query = Gen3Query(auth) # query class\n", + "exp = Gen3Expansion(api,auth,sub) # class with some custom scripts\n", + "exp.get_project_ids()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File previously downloaded.\n", + "node_tsvs/imaging_study_tsvs/jnkns-jenkins_imaging_study.tsv has 810 records.\n", + "length of all dfs: 810\n", + "Master node TSV with 810 total records written to master_imaging_study.tsv.\n" + ] + } + ], + "source": [ + "img_stud = exp.get_node_tsvs(projects = 'jnkns-jenkins', node='imaging_study')\n", + "\n", + "#increasing the volume of the number of records by 10 fold for testing purposes\n", + "img_stud_list = [img_stud.copy(deep=True) for i in range(10)]\n", + "for i in range(10):\n", + " img_stud_list[i][\"submitter_id\"] = img_stud['submitter_id']+\"_\"+str(i) #changing the submitter_id to make them unique\n", + "\n", + "img_stud = pd.concat(img_stud_list)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "type\n", + "submitter_id\t\n", + "datasets\n", + "cases" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8100" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(img_stud)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "8100\n" + ] + } + ], + "source": [ + "img_stud = img_stud[['type', 'submitter_id', 'datasets.submitter_id', 'cases.submitter_id']]\n", + "\n", + "img_stud['datasets.submitter_id'] = 'dataset_backpedalled_usurers'\n", + "\n", + "\n", + "img_stud.to_csv('test_img_stud.tsv', sep='\\t', index=False)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Submitting test_img_stud.tsv with 8100 records.\n", + "Chunk 1 (chunk size: 1000, submitted: 0 of 8100)\n", + "\t Succeeded: 1000 entities.\n", + "Chunk 2 (chunk size: 1000, submitted: 1000 of 8100)\n", + "\t Succeeded: 1000 entities.\n", + "Chunk 3 (chunk size: 1000, submitted: 2000 of 8100)\n", + "\t Succeeded: 1000 entities.\n", + "Chunk 4 (chunk size: 1000, submitted: 3000 of 8100)\n", + "\t Succeeded: 1000 entities.\n", + "Chunk 5 (chunk size: 1000, submitted: 4000 of 8100)\n", + "\t Succeeded: 1000 entities.\n", + "Chunk 6 (chunk size: 1000, submitted: 5000 of 8100)\n", + "\t Succeeded: 1000 entities.\n", + "Chunk 7 (chunk size: 1000, submitted: 6000 of 8100)\n", + "\t Succeeded: 1000 entities.\n", + "Chunk 8 (chunk size: 1000, submitted: 7000 of 8100)\n", + "\t Succeeded: 1000 entities.\n", + "Chunk 9 (chunk size: 1000, submitted: 8000 of 8100)\n", + "\t Succeeded: 100 entities.\n", + "Finished data submission.\n", + "Successful records: 8100\n", + "Failed invalid records: 0\n" + ] + } + ], + "source": 
[
+    "import time\n",
+    "start = time.time()\n",
+    "sub_test = sub.submit_file(project_id = 'jnkns-jenkins', filename = 'test_img_stud.tsv', chunk_size=1000)\n",
+    "end = time.time()\n",
+    "time_taken = end - start\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "time taken to push 810 recoords with sheepdog load_test_branch - 310.8567671775818\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "print(f\"time taken to push 8100 records with sheepdog load_test_branch - {time_taken}\")\n",
+    "\n",
+    "# These are the times taken to push 8100 records to the service\n",
+    "time_load_test_branch = 628.9082880020142  # but the chunk size was cut down to 500\n",
+    "time_load_test_branch_new_base = 641.9752421379089  # the service did not fail after 1 min, keeping chunk size at 1000\n",
+    "time_load_2501 = 299.92830085754395  # benchmark time for 8100 records with tag 25.01\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "sheepdog_3913",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
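
For reference, the indexd helpers added in expansion.py above are meant to be called once per mock data file so the resulting GUID can be written into the file node's object_id column before submission. The sketch below shows one way to call create_mock_indexd_record directly; it is only an illustration, not part of the patch. The commons URL, credentials file, file name, md5, size, and project_id are placeholders (the md5/size pair is made up), and it assumes the same Gen3Expansion setup used in the notebook.

import hashlib

from gen3.auth import Gen3Auth
from gen3.submission import Gen3Submission

from expansion import Gen3Expansion

# Placeholder commons URL and credentials file; substitute your own.
api = "https://qa-midrc.planx-pla.net/"
auth = Gen3Auth(api, refresh_file="midrc_api_key.json")
sub = Gen3Submission(api, auth)
exp = Gen3Expansion(api, auth, sub)

# Hypothetical file attributes; any md5/size pair is fine for a mock record.
file_name = "mock_image_0001.dcm"
md5 = hashlib.md5(file_name.encode()).hexdigest()
size = 1024

irec = exp.create_mock_indexd_record(
    file_name=file_name, md5=md5, size=size, project_id="jnkns-jenkins"
)
print(irec["did"])  # this GUID is what goes into the file node's object_id column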
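
The last two notebook cells time a single submit_file() call and compare the result against hard-coded timings from earlier runs. If that comparison needs to be repeated across chunk sizes or branches, the same pattern can be wrapped in a small helper. This is only a sketch under the same assumptions as the notebook; the project_id, TSV filename, and chunk sizes are placeholders taken from the examples above rather than required values.

import time

from gen3.auth import Gen3Auth
from gen3.submission import Gen3Submission


def benchmark_submit(sub, project_id, filename, chunk_sizes):
    """Time Gen3Submission.submit_file() for each chunk size; returns {chunk_size: seconds}."""
    timings = {}
    for chunk_size in chunk_sizes:
        start = time.time()
        sub.submit_file(project_id=project_id, filename=filename, chunk_size=chunk_size)
        timings[chunk_size] = time.time() - start
    return timings


if __name__ == "__main__":
    # Placeholder commons URL and credentials file; substitute your own.
    api = "https://qa-midrc.planx-pla.net/"
    auth = Gen3Auth(api, refresh_file="midrc_api_key.json")
    sub = Gen3Submission(api, auth)
    # Re-submitting the same TSV updates the same records, so each run exercises a full submission.
    results = benchmark_submit(sub, "jnkns-jenkins", "test_img_stud.tsv", [500, 1000])
    for chunk_size, seconds in results.items():
        print("chunk_size={}: {:.1f} s".format(chunk_size, seconds))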