Running on multiple phenotypes
This page shows how to script training for multiple phenotypes by programmatically updating your YAML config. We demonstrate a single phenotype first; you can then loop the same steps. Start by importing the required modules and defining a few phenotype-specific variables.
import yaml
import tempfile
import genomen.utils as utils
from genomen.data import DataSet, split
from genomen.model import GenomenModel
# Human-readable phenotype label.
phenotype = "Basal metabolic rate"
# Phenotype identifier; written into the DataSetConfig section of the config further down.
phenotype_id = "INI23105"
classification = False

Load your base config.
with open("config.yml", "r") as f:
    configs = list(yaml.load_all(f, Loader=yaml.SafeLoader))

Then update the config with phenotype-specific changes.
# Point the dataset section at this phenotype and record the task type.
dataset_config = configs[0]["DataSetConfig"]
dataset_config["phenotype_id"] = phenotype_id
dataset_config["classification"] = classification

# Residualization is in effect only when covariates are included AND the
# model's covariate strategy is "residualization".
covars_included = dataset_config["covar_config"]["include_covars"]
use_resid = covars_included and (
    configs[1]["GenomenModelConfig"]["covar_config"]["covar_strat"] == "residualization"
)

# Feature selection scores with chi2 only for classification without
# residualization; every other combination uses f_regression.
model_config = configs[1]["GenomenModelConfig"]
feature_selection = model_config["geno_config"]["preprocessing_config"]["feature_selection"]
if classification and not use_resid:
    feature_selection["score_func"] = "chi2"
else:
    feature_selection["score_func"] = "f_regression"
configs[2]["TrainConfig"].update({"scorer": "rocauc" if classification else "r2"})

Write the full config to a temporary file and point GenomEn to it.
# Dump every YAML document into a temporary .yml file. delete=False keeps the
# file on disk after the `with` block closes it, so its path stays usable;
# the file is NOT removed automatically, so delete it yourself when done.
with tempfile.NamedTemporaryFile(mode="w", suffix=".yml", delete=False) as config_file:
    temp_config_path = config_file.name
    yaml.safe_dump_all(configs, config_file)
utils.set_config_path(temp_config_path)

Finally, proceed as usual.
# Build the dataset; no arguments are passed here.
# NOTE(review): presumably DataSet() reads its settings from the config path set earlier — confirm.
dataset = DataSet()
# Split into train / test / validation sets; test_size=0.2 is passed to split().
# test_set is not used by the fit call that follows.
train_set, test_set, val_set = split(dataset, test_size=0.2)
# Build the model; no arguments are passed here either.
# NOTE(review): presumably also configured via the config path — confirm.
model = GenomenModel()
model.fit(train_set, val_set)

Looping over multiple phenotypes
To process many phenotypes, repeat the steps above inside a loop:
import os

# One entry per phenotype: display name, phenotype ID, and whether the
# phenotype is treated as a classification task.
phenotypes = [
    {"name": "Basal metabolic rate", "id": "INI23105", "classification": False},
    {"name": "Asthma", "id": "HCXXXX", "classification": True},
    # ...
]

for p in phenotypes:
    phenotype, phenotype_id, classification = p["name"], p["id"], p["classification"]

    # Reload the base config on every iteration so the edits made for one
    # phenotype never carry over to the next.
    with open("config.yml", "r") as f:
        configs = list(yaml.load_all(f, Loader=yaml.SafeLoader))

    configs[0]["DataSetConfig"].update(
        {"phenotype_id": phenotype_id, "classification": classification}
    )

    # Residualization is in effect only when covariates are included AND the
    # model's covariate strategy is "residualization".
    use_resid = (
        configs[0]["DataSetConfig"]["covar_config"]["include_covars"]
        and configs[1]["GenomenModelConfig"]["covar_config"]["covar_strat"] == "residualization"
    )

    # chi2 is used only for classification without residualization;
    # every other combination uses f_regression.
    configs[1]["GenomenModelConfig"]["geno_config"]["preprocessing_config"]["feature_selection"].update(
        {"score_func": "chi2" if classification and not use_resid else "f_regression"}
    )
    configs[2]["TrainConfig"].update({"scorer": "rocauc" if classification else "r2"})

    # Write the config into a per-phenotype temporary directory. The directory
    # (and the config file in it) is removed when the `with` block exits, so
    # the loop no longer leaves one stray temp file behind per phenotype.
    # NOTE(review): assumes GenomEn only needs the config file while the
    # dataset and model are built and fitted -- keep any further step that
    # depends on the config inside this block.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_config_path = os.path.join(temp_dir, "config.yml")
        with open(temp_config_path, "w") as temp_config:
            yaml.safe_dump_all(configs, temp_config)
        utils.set_config_path(temp_config_path)

        dataset = DataSet()
        train_set, test_set, val_set = split(dataset, test_size=0.2)
        model = GenomenModel()
        model.fit(train_set, val_set)