# enb/plugins/template_basic_workflow_example/basic_workflow.py

#!/usr/bin/env python3
"""Example showing the basic workflow of the ``enb`` library in a simple ATable subclass.
"""
__author__ = "Miguel Hernández-Cabronero"
__since__ = "2020/09/13"

import os
import glob
import shutil
import enb.atable
import enb.aanalysis


class WikiTable(enb.atable.ATable):
    """Example table that computes, for each input file path, its line
    count, its word count and a "dead"/"alive" status label.
    """

    # Any method whose name begins with column_ is picked up automatically
    # as a column definition; the value it returns is the cell value.
    def column_line_count(self, file_path, row):
        # Count lines by walking the file, one line at a time.
        line_total = 0
        with open(file_path, mode="r", encoding="utf-8") as text_file:
            for _ in text_file:
                line_total += 1
        return line_total

    # Extra details about a column can be given through the decorator.
    # It accepts several column names and/or enb.atable.ColumnProperties
    # instances as *args, so that one function defines multiple columns.
    @enb.atable.column_function(
        enb.atable.ColumnProperties(
            name="word_count",
            label="Word count",
            plot_min=0),
        "status")
    def set_word_count(self, file_path, row):
        # Fills both the "word_count" and the "status" entries of row.
        with open(file_path, encoding="utf-8", mode="r") as text_file:
            text = text_file.read()

        # Words are the whitespace-separated tokens of the whole file.
        words = text.split()
        row["word_count"] = len(words)

        # A file mentioning "death_place" (in any letter case) is labelled "dead".
        if "death_place" in text.lower():
            row["status"] = "dead"
        else:
            row["status"] = "alive"


def main():
    """Run the example workflow: list the sample files, build the WikiTable
    dataframe, print part of it, and run the numeric analyzers on it.
    """
    # Step 1: collect the paths of all data samples
    wiki_paths = glob.glob("./data/wiki/*.txt")

    # Step 2: run the experiment to gather the data.
    # The csv_support_path argument (e.g., "persistence_basic_workflow.csv")
    # is deliberately left unset here.
    wiki_table = WikiTable()
    wiki_df = wiki_table.get_df(target_indices=wiki_paths)

    # Print a couple of columns of the resulting table for the documentation.
    shown_columns = ["index", "line_count"]
    print(wiki_df[shown_columns])

    # Step 3: plot results
    #   3a. Distribution of line counts
    line_count_analyzer = enb.aanalysis.ScalarNumericAnalyzer()
    line_count_summary = line_count_analyzer.get_df(
        full_df=wiki_df,
        target_columns=["line_count"])
    os.makedirs("analysis", exist_ok=True)
    line_count_summary.to_csv("analysis/line_count_analysis.csv")

    #   3b. Distribution of word counts, one group per status value
    word_count_analyzer = enb.aanalysis.ScalarNumericAnalyzer()
    word_count_summary = word_count_analyzer.get_df(
        full_df=wiki_df,
        target_columns=["word_count"],
        group_by="status",
        show_global=False)
    os.makedirs("analysis", exist_ok=True)
    word_count_summary.to_csv("analysis/word_count_analysis.csv")

    #   3c. Scatter plot: line count vs word count (returned table is not used)
    scatter_analyzer = enb.aanalysis.TwoNumericAnalyzer()
    scatter_analyzer.get_df(
        full_df=wiki_df,
        target_columns=[("line_count", "word_count")],
        column_to_properties=wiki_table.column_to_properties)


# Run the example workflow only when this file is executed as a script.
if __name__ == "__main__":
    main()