Running on multiple phenotypes

This page shows how to script training for multiple phenotypes by programmatically updating your YAML config. We demonstrate a single phenotype first; you can then loop the same steps. Start by importing the required modules and defining a few phenotype-specific variables.

import yaml
import tempfile
import genomen.utils as utils
from genomen.data import DataSet, split
from genomen.model import GenomenModel

# Human-readable label of the phenotype to train on.
phenotype = "Basal metabolic rate"
# Identifier written into DataSetConfig["phenotype_id"] further down.
phenotype_id = "INI23105"
# True for a classification phenotype, False otherwise; this flag drives the
# feature-selection score function and the training scorer chosen below.
classification = False

Load your base config.

# Parse every YAML document of the base config into a list so that each
# document can be addressed by position (configs[0], configs[1], ...).
with open("config.yml", "r") as config_file:
    configs = list(yaml.safe_load_all(config_file))

Then update the config with phenotype-specific changes.

# Document 0: point the dataset at this phenotype and record the task type.
dataset_cfg = configs[0]["DataSetConfig"]
dataset_cfg["phenotype_id"] = phenotype_id
dataset_cfg["classification"] = classification

# Residualization is in effect only when covariates are included AND the model
# is configured to handle them with the "residualization" strategy.
include_covars = dataset_cfg["covar_config"]["include_covars"]
use_resid = include_covars and (
    configs[1]["GenomenModelConfig"]["covar_config"]["covar_strat"] == "residualization"
)

# Document 1: "chi2" is used only for classification without residualization;
# every other case falls back to "f_regression" (presumably because a
# residualized target is continuous -- confirm against the GenomEn docs).
if classification and not use_resid:
    score_func = "chi2"
else:
    score_func = "f_regression"
preprocessing_cfg = configs[1]["GenomenModelConfig"]["geno_config"]["preprocessing_config"]
preprocessing_cfg["feature_selection"]["score_func"] = score_func

# Document 2: choose the evaluation metric that matches the task type.
configs[2]["TrainConfig"]["scorer"] = "rocauc" if classification else "r2"

Write the full config to a temporary file and point GenomEn to it.

# delete=False keeps the file on disk after it is closed, so it can still be
# read through its path afterwards.
temp_config = tempfile.NamedTemporaryFile(mode="w", suffix=".yml", delete=False)
with temp_config:
    # Write all config documents back out as one multi-document YAML stream.
    yaml.safe_dump_all(configs, temp_config)
temp_config_path = temp_config.name

# The file is closed (and therefore flushed) at this point; register its path.
utils.set_config_path(temp_config_path)

Finally, proceed as usual.

# Build the dataset. No arguments are passed here -- presumably it picks up the
# config registered via utils.set_config_path (confirm against the GenomEn docs).
dataset = DataSet()
# Split into train / test / validation subsets, requesting test_size=0.2.
train_set, test_set, val_set = split(dataset, test_size=0.2)

# Fit on the training subset, passing the validation subset as second argument.
# test_set is not used by this snippet.
model = GenomenModel()
model.fit(train_set, val_set)

Looping over multiple phenotypes

To process many phenotypes, repeat the steps above inside a loop:

# One entry per phenotype: display name, phenotype ID and whether it is a
# classification task.
# NOTE(review): "HCXXXX" looks like a placeholder ID -- replace it with a real one.
phenotypes = [
    {"name": "Basal metabolic rate", "id": "INI23105", "classification": False},
    {"name": "Asthma", "id": "HCXXXX", "classification": True},
    # ...
]

# Train one model per phenotype. Each iteration re-reads the base config so
# that edits made for one phenotype never carry over into the next.
for p in phenotypes:
    phenotype, phenotype_id, classification = p["name"], p["id"], p["classification"]

    with open("config.yml", "r") as f:
        configs = list(yaml.load_all(f, Loader=yaml.SafeLoader))

    # Document 0: point the dataset at this phenotype and record the task type.
    configs[0]["DataSetConfig"].update(
        {"phenotype_id": phenotype_id, "classification": classification}
    )
    # Residualization is in effect only when covariates are included AND the
    # model is configured with the "residualization" covariate strategy.
    use_resid = (
        configs[0]["DataSetConfig"]["covar_config"]["include_covars"]
        and configs[1]["GenomenModelConfig"]["covar_config"]["covar_strat"] == "residualization"
    )
    # Document 1: "chi2" only for classification without residualization.
    configs[1]["GenomenModelConfig"]["geno_config"]["preprocessing_config"]["feature_selection"].update(
        {"score_func": "chi2" if classification and not use_resid else "f_regression"}
    )
    # Document 2: choose the evaluation metric that matches the task type.
    configs[2]["TrainConfig"].update({"scorer": "rocauc" if classification else "r2"})

    # delete=False keeps the file on disk after the `with` block closes it.
    # NOTE: one temporary file is left behind per phenotype.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".yml", delete=False) as temp_config:
        yaml.safe_dump_all(configs, temp_config)
        temp_config_path = temp_config.name

    # Register the path only after the file has been closed, so its contents
    # are guaranteed to be flushed to disk before anything reads the config
    # (same order as in the single-phenotype example).
    utils.set_config_path(temp_config_path)

    dataset = DataSet()
    train_set, test_set, val_set = split(dataset, test_size=0.2)

    model = GenomenModel()
    model.fit(train_set, val_set)