Skip to content

Commit

Permalink
Merge pull request #92 from uhh-cms/feature/cf_multi_config_task
Browse files Browse the repository at this point in the history
MultiConfigPlotting and corresponding analysis changes
  • Loading branch information
mafrahm authored Nov 26, 2024
2 parents e90add5 + 2fc436f commit f6e2548
Show file tree
Hide file tree
Showing 13 changed files with 137 additions and 139 deletions.
13 changes: 13 additions & 0 deletions hbw/analysis/create_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@
logger = law.logger.get_logger(__name__)


from hbw.config.defaults_and_groups import (
default_calibrator, default_selector, default_producers, default_ml_model,
ml_inputs_producer,
)


@timeit_multiple
def create_hbw_analysis(
name,
Expand Down Expand Up @@ -62,6 +68,13 @@ def create_hbw_analysis(
# (used in wrapper_factory)
analysis_inst.set_aux("config_groups", {})

analysis_inst.x.default_calibrator = default_calibrator(analysis_inst)
analysis_inst.x.default_selector = default_selector(analysis_inst)
analysis_inst.x.default_producer = default_producers
analysis_inst.x.default_weight_producer = "default"
analysis_inst.x.ml_inputs_producer = ml_inputs_producer(analysis_inst)
analysis_inst.x.default_ml_model = default_ml_model

#
# define configs
#
Expand Down
2 changes: 1 addition & 1 deletion hbw/categorization/categories.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,7 @@ def catid_2b(self: Categorizer, events: ak.Array, **kwargs) -> tuple[ak.Array, a
"hh_ggf_hbb_hvvqqlnu_kl1_kt1", "hh_vbf_hbb_hvvqqlnu_kv1_k2v1_kl1",
"hh_ggf_hbb_hvv2l2nu_kl1_kt1", "hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1",
"tt", "st", "w_lnu", "dy", "v_lep", "h",
"dy_m50toinf",
"dy_m50toinf", "tt_dl",
]
for proc in ml_processes:
@categorizer(
Expand Down
17 changes: 11 additions & 6 deletions hbw/config/defaults_and_groups.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,15 +107,16 @@ def set_config_defaults_and_groups(config_inst):
# Defaults
#

# NOTE: many of these have been moved to analysis_inst, TODO cleanup
# TODO: the default dataset is currently still being set up by the law.cfg
config_inst.x.default_dataset = default_signal_dataset = f"{default_signal_process}_{signal_generator}"
config_inst.x.default_calibrator = default_calibrator(config_inst)
config_inst.x.default_selector = default_selector(config_inst)
# config_inst.x.default_calibrator = default_calibrator(config_inst)
# config_inst.x.default_selector = default_selector(config_inst)
config_inst.x.ml_inputs_producer = ml_inputs_producer(config_inst)
config_inst.x.default_producer = default_producers
config_inst.x.default_weight_producer = "default"
# config_inst.x.default_weight_producer = "btag_not_normalized"
config_inst.x.default_ml_model = default_ml_model
# config_inst.x.default_producer = default_producers
# config_inst.x.default_weight_producer = "default"
# # config_inst.x.default_weight_producer = "btag_not_normalized"
# config_inst.x.default_ml_model = default_ml_model
config_inst.x.default_inference_model = "default" if year == 2017 else "sl_22"
config_inst.x.default_categories = ["incl"]
config_inst.x.default_variables = ["jet1_pt"]
Expand Down Expand Up @@ -205,6 +206,10 @@ def set_config_defaults_and_groups(config_inst):
"dilep": ["tt_*", "st_*", "dy_*", "w_lnu_*", "hh_ggf_*"],
"h": ["h_ggf_*", "h_vbf_*", "zh_*", "wph_*", "wmh_*", "tth_*", "ttzh_*", "ttwh_*"],
}
if config_inst.name == "l22post":
config_inst.x.dataset_groups["test123"] = ["tt_dl_powheg", "tt_sl_powheg"]
elif config_inst.name == "l22pre":
config_inst.x.dataset_groups["test123"] = ["tt_dl_powheg"]

# category groups for conveniently looping over certain categories
# (used during plotting and for rebinning)
Expand Down
14 changes: 14 additions & 0 deletions hbw/config/processes.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@

from cmsdb.util import add_decay_process

from hbw.config.styling import color_palette


def add_parent_process(config: od.Config, child_procs: list[od.Process], **kwargs):
"""
Expand Down Expand Up @@ -185,3 +187,15 @@ def configure_hbw_processes(config: od.Config):
xsecs=None,
aux={"flavour": flavour},
)

# create main background process
background = config.add_process(
name="background",
id=99999,
label="background",
color=color_palette["blue"],
)
for bg in ["tt", "dy", "st", "vv", "w_lnu", "h"]:
if config.has_process(bg):
bg = config.get_process(bg)
background.add_process(bg)
33 changes: 17 additions & 16 deletions hbw/ml/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,31 +196,32 @@ def setup(self):
# NOTE: since these variables are only used in ConfigTasks,
# we do not need to add these variables to all configs
for proc in self.processes:
if f"mlscore.{proc}" not in self.config_inst.variables:
self.config_inst.add_variable(
name=f"mlscore.{proc}",
null_value=-1,
binning=(1000, 0., 1.),
x_title=f"DNN output score {self.config_inst.get_process(proc).x.ml_label}",
aux={"rebin": 25}, # automatically rebin to 40 bins for plotting tasks
)

def preparation_producer(self: MLModel, config_inst: od.Config):
for config_inst in self.config_insts:
if f"mlscore.{proc}" not in config_inst.variables:
config_inst.add_variable(
name=f"mlscore.{proc}",
null_value=-1,
binning=(1000, 0., 1.),
x_title=f"DNN output score {config_inst.get_process(proc).x.ml_label}",
aux={"rebin": 25}, # automatically rebin to 40 bins for plotting tasks
)

def preparation_producer(self: MLModel, analysis_inst: od.Analysis):
    """
    Return the name of the producer run as part of PrepareMLEvents and
    MLEvaluation (i.e. before `evaluate` is called).
    """
    # a single fixed producer is used, independent of the analysis instance
    return "ml_preparation"

def training_calibrators(self, analysis_inst: od.Analysis, requested_calibrators: Sequence[str]) -> list[str]:
    """
    Resolve the calibrators used to fix the MLTraining phase space.

    NOTE: automatic resolving is not working here, so we fall back to the
    analysis-level default calibrator when none were requested.
    """
    # defect fixed: the visible span duplicated the pre-change signature/return
    # (config_inst-based) next to the post-change ones, which does not parse;
    # this is the resolved post-change (analysis_inst-based) version
    if requested_calibrators:
        return requested_calibrators
    return [analysis_inst.x.default_calibrator]

def training_producers(self, analysis_inst: od.Analysis, requested_producers: Sequence[str]) -> list[str]:
    """
    Resolve the producers used to fix the MLTraining phase space.

    NOTE: `requested_producers` is deliberately ignored here — the training
    always runs the fixed producer chain below.
    NOTE: might be nice to keep the "pre_ml_cats" for consistency, but running two
    categorization Producers in the same workflow is messy, so we skip it for now.
    """
    # defect fixed: the visible span duplicated the pre-change signature and
    # commented-out returns (config_inst-based) next to the post-change ones,
    # which does not parse; this is the resolved post-change version
    return ["event_weights", analysis_inst.x.ml_inputs_producer]

def requires(self, task: law.Task) -> dict[str, Any]:
# Custom requirements (none currently)
Expand Down
36 changes: 19 additions & 17 deletions hbw/ml/derived/dl.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,22 +136,23 @@ def setup(self):
# NOTE: since these variables are only used in ConfigTasks,
# we do not need to add these variables to all configs
for proc in self.processes:
if f"mlscore.{proc}_manybins" not in self.config_inst.variables:
self.config_inst.add_variable(
name=f"mlscore.{proc}_manybins",
expression=f"mlscore.{proc}",
null_value=-1,
binning=(1000, 0., 1.),
x_title=f"DNN output score {self.config_inst.get_process(proc).x.ml_label}",
aux={"rebin": 40},
)
self.config_inst.add_variable(
name=f"mlscore40.{proc}",
expression=f"mlscore.{proc}",
null_value=-1,
binning=(40, 0., 1.),
x_title=f"DNN output score {self.config_inst.get_process(proc).x.ml_label}",
)
for config_inst in self.config_insts:
if f"mlscore.{proc}_manybins" not in config_inst.variables:
config_inst.add_variable(
name=f"mlscore.{proc}_manybins",
expression=f"mlscore.{proc}",
null_value=-1,
binning=(1000, 0., 1.),
x_title=f"DNN output score {config_inst.get_process(proc).x.ml_label}",
aux={"rebin": 40},
)
config_inst.add_variable(
name=f"mlscore40.{proc}",
expression=f"mlscore.{proc}",
null_value=-1,
binning=(40, 0., 1.),
x_title=f"DNN output score {config_inst.get_process(proc).x.ml_label}",
)


#
Expand Down Expand Up @@ -271,5 +272,6 @@ def setup(self):
dl_22.derive("dl_22_v1")
dl_22_limited = dl_22post.derive("dl_22_limited", cls_dict={
"training_configs": lambda self, requested_configs: ["l22pre", "l22post"],
"processes": ["hh_ggf_hbb_hvv2l2nu_kl1_kt1", "st_tchannel_t"],
"epochs": 4,
"processes": ["hh_ggf_hbb_hvv2l2nu_kl1_kt1", "tt_dl"],
})
3 changes: 2 additions & 1 deletion hbw/tasks/corrections.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from functools import cached_property

from columnflow.tasks.framework.base import Requirements
from columnflow.tasks.framework.base import Requirements, ConfigTask
from columnflow.tasks.framework.mixins import (
SelectorMixin, CalibratorsMixin,
)
Expand All @@ -25,6 +25,7 @@

class GetBtagNormalizationSF(
HBWTask,
ConfigTask,
SelectorMixin,
CalibratorsMixin,
# law.LocalWorkflow,
Expand Down
75 changes: 8 additions & 67 deletions hbw/tasks/ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
CalibratorsMixin,
ProducersMixin,
MLModelTrainingMixin,
MLModelMixin,
MLModelsMixin,
MLModelDataMixin,
SelectorStepsMixin,
Expand Down Expand Up @@ -224,39 +223,23 @@ def workflow_requires(self):
self,
config=config_inst.name,
dataset=dataset_inst.name,
calibrators=_calibrators,
selector=_selector,
producers=_producers,
branch=-1,
)
for dataset_inst in dataset_insts
}
for (config_inst, dataset_insts), _calibrators, _selector, _producers in zip(
self.ml_model_inst.used_datasets.items(),
self.calibrators,
self.selectors,
self.producers,
)
for config_inst, dataset_insts in self.ml_model_inst.used_datasets.items()
}
reqs["stats"] = {
config_inst.name: {
dataset_inst.name: self.reqs.MergeMLStats.req(
self,
config=config_inst.name,
dataset=dataset_inst.name,
calibrators=_calibrators,
selector=_selector,
producers=_producers,
tree_index=-1,
)
for dataset_inst in dataset_insts
}
for (config_inst, dataset_insts), _calibrators, _selector, _producers in zip(
self.ml_model_inst.used_datasets.items(),
self.calibrators,
self.selectors,
self.producers,
)
for config_inst, dataset_insts in self.ml_model_inst.used_datasets.items()
}
return reqs

Expand All @@ -275,20 +258,12 @@ def requires(self):
self,
config=config_inst.name,
dataset=dataset_inst.name,
calibrators=_calibrators,
selector=_selector,
producers=_producers,
branch=-1,
)
for dataset_inst in dataset_insts
if dataset_inst.x.ml_process == process
}
for (config_inst, dataset_insts), _calibrators, _selector, _producers in zip(
self.ml_model_inst.used_datasets.items(),
self.calibrators,
self.selectors,
self.producers,
)
for config_inst, dataset_insts in self.ml_model_inst.used_datasets.items()
}

# load stats for all processes
Expand All @@ -298,18 +273,10 @@ def requires(self):
self,
config=config_inst.name,
dataset=dataset_inst.name,
calibrators=_calibrators,
selector=_selector,
producers=_producers,
tree_index=-1)
for dataset_inst in dataset_insts
}
for (config_inst, dataset_insts), _calibrators, _selector, _producers in zip(
self.ml_model_inst.used_datasets.items(),
self.calibrators,
self.selectors,
self.producers,
)
for config_inst, dataset_insts in self.ml_model_inst.used_datasets.items()
}

return reqs
Expand Down Expand Up @@ -435,13 +402,9 @@ def workflow_run(self):


class MLEvaluationSingleFold(
# NOTE: this should probably be a MLModelTrainingMixin, but I'll postpone this until the MultiConfigTask
# is implemented
# NOTE: mixins might need fixing, needs to be checked
HBWTask,
MLModelMixin,
ProducersMixin,
SelectorMixin,
CalibratorsMixin,
MLModelTrainingMixin,
law.LocalWorkflow,
RemoteWorkflow,
):
Expand Down Expand Up @@ -489,16 +452,10 @@ def workflow_requires(self):
self,
branches=(self.fold,),
configs=(self.config_inst.name,),
calibrators=(self.calibrators,),
selectors=(self.selector,),
producers=(self.producers,),
)
reqs["preml"] = self.reqs.MLPreTraining.req_different_branching(
self,
configs=(self.config_inst.name,),
calibrators=(self.calibrators,),
selectors=(self.selector,),
producers=(self.producers,),
branch=-1,
)
# reqs["preml"] = self.reqs.MLPreTraining.req_different_branching(self, branch=-1)
Expand All @@ -515,16 +472,10 @@ def requires(self):
branches=(self.fold,),
branch=self.fold,
configs=(self.config_inst.name,),
calibrators=(self.calibrators,),
selectors=(self.selector,),
producers=(self.producers,),
)
reqs["preml"] = self.reqs.MLPreTraining.req_different_branching(
self,
configs=(self.config_inst.name,),
calibrators=(self.calibrators,),
selectors=(self.selector,),
producers=(self.producers,),
branch=-1,
)
# reqs["preml"] = self.reqs.MLPreTraining.req_different_branching(self, branch=-1)
Expand Down Expand Up @@ -585,13 +536,9 @@ def run(self):


class PlotMLResultsSingleFold(
# NOTE: this should probably be a MLModelTrainingMixin, but I'll postpone this until the MultiConfigTask
# is implemented
# NOTE: mixins might need fixing, needs to be checked
HBWTask,
MLModelMixin,
ProducersMixin,
SelectorMixin,
CalibratorsMixin,
MLModelTrainingMixin,
law.LocalWorkflow,
RemoteWorkflow,
):
Expand Down Expand Up @@ -649,19 +596,13 @@ def requires(self):
"training": self.reqs.MLTraining.req_different_branching(
self,
configs=(self.config_inst.name,),
calibrators=(self.calibrators,),
selectors=(self.selector,),
producers=(self.producers,),
branches=(self.fold,),
),
}

reqs["preml"] = self.reqs.MLPreTraining.req_different_branching(
self,
configs=(self.config_inst.name,),
calibrators=(self.calibrators,),
selectors=(self.selector,),
producers=(self.producers,),
branch=-1,
)
reqs["mlpred"] = {
Expand Down
Loading

0 comments on commit f6e2548

Please sign in to comment.