
Merge pull request #824 from aishwariyachakraborty/new-branch
Semantic profiler and report generation module integration
touma-I authored Dec 4, 2024
2 parents 6866f78 + 83c015c commit 299aa5f
Showing 54 changed files with 88,735 additions and 1,212 deletions.
10 changes: 10 additions & 0 deletions transforms/code/code_profiler/README.md
@@ -61,3 +61,13 @@ The high-level system design is as follows:
For each new target language, the offline phase is utilized to create deterministic rules by harnessing the capabilities of LLMs and working with exemplar code samples from the target language. In this process, Workflow W1 facilitates the creation of rules around syntactic structures based on exemplar code samples, while Workflow W2 is used to establish semantic dimensions for profiling. Subsequently, we derive rules that connect syntactic constructs to the predefined semantic concepts. These rules are then stored in a rule database, ready to be employed during the online phase.

In the online phase, the system dynamically generates profiling outputs for any incoming code snippets. This is achieved by extracting concepts from the snippets using the rules in the database and storing these extractions in a tabular format. The structured tabular format allows for generating additional concept columns, which are then utilized to create comprehensive profiling reports.
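As a rough illustration of the online phase described above, the sketch below applies a small rule table to pre-parsed node types and collects the extractions as table rows, from which additional concept columns could be derived. All names here (`RULES`, `profile_snippet`, the node-type strings) are hypothetical, not the module's real API.

```python
# Hypothetical sketch of the online phase: rules map syntactic
# constructs to semantic concepts, and matching extractions are
# collected into a flat table (one dict per row).

RULES = {
    "import_statement": "uast_package",
    "function_definition": "uast_function",
    "comment": "uast_comment",
}

def profile_snippet(ast_nodes):
    """Emit one table row per AST node that matches a rule."""
    rows = []
    for node_type, text in ast_nodes:
        concept = RULES.get(node_type)
        if concept is not None:
            rows.append({"concept": concept, "snippet": text})
    return rows

rows = profile_snippet([
    ("import_statement", "import os"),
    ("comment", "# helper"),
    ("identifier", "x"),       # no rule -> not profiled
])
```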

The following runtimes are available:
* [python](python/README.md) - provides the base python-based transformation
implementation and python runtime.
* [ray](ray/README.md) - enables the running of the base python transformation
  in a Ray runtime.

Please refer to the playbook at `transforms/code/code_profiler/notebook_example/code-profiler.ipynb` to run the Python code profiler.


5 changes: 3 additions & 2 deletions transforms/code/code_profiler/input/data_profiler_params.json
@@ -1,5 +1,6 @@
{
"input": "multi-package.parquet",
"contents": "Contents",
"language": "Language"
"dynamic_schema_mapping": "True",
"contents": "contents",
"language": "language"
}
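For illustration, the updated parameters could be consumed as sketched below. Note that `dynamic_schema_mapping` arrives as the string `"True"`, so a consumer would need to coerce it to a boolean. This loader is an assumption for demonstration, not the actual code_profiler implementation.

```python
import json

# Illustrative only: one way a transform might read these parameters.
# The string flag "True" is coerced to a real boolean.
params = json.loads("""
{
    "input": "multi-package.parquet",
    "dynamic_schema_mapping": "True",
    "contents": "contents",
    "language": "language"
}
""")

dynamic = params["dynamic_schema_mapping"].lower() == "true"
```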
Binary file modified transforms/code/code_profiler/input/multi-package.parquet
Binary file not shown.
1,130 changes: 96 additions & 1,034 deletions transforms/code/code_profiler/notebook_example/code-profiler.ipynb

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions transforms/code/code_profiler/python/Makefile
@@ -35,7 +35,7 @@ setup:: .transforms.setup
set-versions:
$(MAKE) TRANSFORM_PYTHON_VERSION=$(CODE_PROFILER_PYTHON_VERSION) TOML_VERSION=$(CODE_PROFILER_PYTHON_VERSION) .transforms.set-versions

build-dist:: .defaults.build-dist
build-dist:: .defaults.build-dist

publish-dist:: .defaults.publish-dist

@@ -51,5 +51,5 @@ run-local-sample: .transforms.run-local-sample

run-local-python-sample:
$(MAKE) RUN_FILE=code_profiler_local_python.py \
RUN_ARGS="--content 'Contents' --language 'Language'" \
RUN_ARGS="--content 'contents' --language 'language'" \
.transforms.run-local-python-sample
11 changes: 11 additions & 0 deletions transforms/code/code_profiler/python/README.md
@@ -17,6 +17,17 @@ the options provided by
the [python launcher](../../../../data-processing-lib/doc/python-launcher-options.md).

### Running the samples

The code profiler can be run on mach-arm64 and x86_64 host architectures.
Set `RUNTIME_HOST_ARCH` in the Makefile to match your host architecture.
```
# possible values: mach-arm64, x86_64
export RUNTIME_HOST_ARCH=x86_64
```
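If you are unsure which value applies, the hardware name reported by `uname -m` can be mapped onto the two values listed above. This mapping is a convenience sketch, not part of the Makefile:

```shell
# Pick RUNTIME_HOST_ARCH from the machine hardware name reported by
# uname -m (arm64/aarch64 on Apple Silicon, x86_64 elsewhere).
case "$(uname -m)" in
  arm64|aarch64) export RUNTIME_HOST_ARCH=mach-arm64 ;;
  *)             export RUNTIME_HOST_ARCH=x86_64 ;;
esac
echo "$RUNTIME_HOST_ARCH"
```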
If you are using a Mac, you may need to allow it to load the `.so` file in the security settings. The prompt generally appears under the Security tab while running the transform.

![macOS security settings pop-up](image.png)

To run the samples, use the following `make` targets

* `run-local-sample` - runs src/code_profiler_local.py
1 change: 1 addition & 0 deletions transforms/code/code_profiler/python/pyproject.toml
@@ -7,6 +7,7 @@ license = {text = "Apache-2.0"}
readme = {file = "README.md", content-type = "text/markdown"}
authors = [
{ name = "Pankaj Thorat", email = "[email protected]" },
{ name = "Aishwariya Chakraborty", email = "[email protected]" },
]

dynamic = ["dependencies"]
3 changes: 2 additions & 1 deletion transforms/code/code_profiler/python/requirements.txt
@@ -95,4 +95,5 @@ tzdata==2024.1
urllib3==2.2.2
uuid
wcwidth==0.2.13
wrapt==1.16.0
wrapt==1.16.0
plotly==5.15.0
33 changes: 31 additions & 2 deletions transforms/code/code_profiler/python/src/UAST_parser.py
@@ -228,8 +228,9 @@ def _add_user_defined(self, node):
return

# Traversing through the AST to create nodes recursively.
def _dfs(self, AST_node, parent) :
if (AST_node.type in self.rules) :
def _dfs(self, AST_node, parent):

if (AST_node.type in self.rules):
ast_snippet = AST_node.text.decode("utf8")
node_type = self.rules[AST_node.type]["uast_node_type"]
exec_string = self.rules[AST_node.type]["extractor"]
@@ -269,3 +270,31 @@ def _extract(self, ast_snippet, node_type, exec_string):
return self.grammar[node_type]["keyword"] + " " + self.extracted
except Exception as e:
print(e)

def uast_read(jsonstring):
    """
    Reads an input JSON string into a UAST class object.
    """
    uast = UAST()
    if jsonstring is not None and jsonstring != 'null':
        uast.load_from_json_string(jsonstring)
        return uast
    return None

def extract_ccr(uast):
    """
    Calculates the code-to-comment ratio given a UAST object as input.
    """
    if uast is not None:
        total_comment_loc = 0
        loc_snippet = 0  # default in case no uast_root node is present
        for node_idx in uast.nodes:
            node = uast.get_node(node_idx)
            if node.node_type == 'uast_comment':
                total_comment_loc += node.metadata.get("loc_original_code", 0)
            elif node.node_type == 'uast_root':
                loc_snippet = node.metadata.get("loc_snippet", 0)
        if total_comment_loc > 0:
            return loc_snippet / total_comment_loc
        return None
    return None
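A minimal, self-contained demonstration of the code-to-comment ratio calculation: `FakeNode` and `FakeUAST` are stand-ins for the real UAST classes defined elsewhere in this module, and the function body is re-stated (with `loc_snippet` initialized up front) so the snippet runs on its own.

```python
# Stand-ins mimicking the node interface extract_ccr relies on:
# uast.nodes (iterable of indices), uast.get_node(idx), and nodes
# exposing .node_type and .metadata.

class FakeNode:
    def __init__(self, node_type, metadata):
        self.node_type = node_type
        self.metadata = metadata

class FakeUAST:
    def __init__(self, nodes):
        self.nodes = list(range(len(nodes)))
        self._nodes = nodes
    def get_node(self, idx):
        return self._nodes[idx]

def extract_ccr(uast):
    """Code-to-comment ratio: snippet LOC divided by total comment LOC."""
    if uast is not None:
        total_comment_loc = 0
        loc_snippet = 0
        for node_idx in uast.nodes:
            node = uast.get_node(node_idx)
            if node.node_type == 'uast_comment':
                total_comment_loc += node.metadata.get("loc_original_code", 0)
            elif node.node_type == 'uast_root':
                loc_snippet = node.metadata.get("loc_snippet", 0)
        if total_comment_loc > 0:
            return loc_snippet / total_comment_loc
        return None
    return None

# 40 lines of code and 5 + 3 comment lines -> ratio of 5.0
uast = FakeUAST([
    FakeNode("uast_root", {"loc_snippet": 40}),
    FakeNode("uast_comment", {"loc_original_code": 5}),
    FakeNode("uast_comment", {"loc_original_code": 3}),
])
ccr = extract_ccr(uast)
```

A snippet with no comment nodes yields `None` rather than a division by zero, as does a `None` input.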
@@ -24,8 +24,8 @@
local_conf = {
"input_folder": input_folder,
"output_folder": output_folder,
"contents": "Contents",
"language": "Language"
"contents": "contents",
"language": "language"
}
params = {
# Data access. Only required parameters are specified
