Skip to content

Utilities and helper functions

Unified harmonization methods for ML and DL.

load_MAREoS(effects=None, effect_types=None, effect_examples=None, as_numpy=True, data_dir=None, force_download=False, verbose=False)

Load multiple MAREoS datasets.

Parameters:

Name Type Description Default
effects list of str, str or None, optional (default None)

List of effects to load. If None, loads all ["eos", "true"]

None
effect_types list of str, str or None, optional (default None)

List of effect types to load. If None, loads all ["simple", "interaction"]

None
effect_examples list of str, str or None, optional (default None)

List of examples to load. If None, loads all ["1", "2"].

None
as_numpy bool, optional (default True)

If True, return numpy.ndarray, else pandas.DataFrame.

True
data_dir Path | None, optional (default None)

Directory containing MAREoS data files. If None, downloads to cache.

None
force_download bool, optional (default False)

Force re-downloading the dataset in case of corrupted files.

False
verbose bool, optional (default False)

Control verbosity.

False

Returns:

Type Description
dict of str and dict

Nested dictionary where keys are dataset names containing:

  • "X": Feature matrix
  • "y": Target labels
  • "sites": Site labels
  • "covs": Covariates
  • "folds": Cross-validation folds

Raises:

Type Description
ValueError

If any parameter contains invalid values.

Examples:

>>> datasets = load_MAREoS()
>>> len(datasets)
8
>>> datasets = load_MAREoS(effects=["eos"], effect_types=["simple"])
>>> len(datasets)
2
>>> list(datasets.keys())
['eos_simple1', 'eos_simple2']
Source code in src/uniharmony/datasets/_load_mareos.py
def load_MAREoS(  # noqa: N802
    effects: list[str] | str | None = None,
    effect_types: list[str] | str | None = None,
    effect_examples: list[str] | str | None = None,
    as_numpy: bool = True,
    data_dir: Path | None = None,
    force_download: bool = False,
    verbose: bool = False,
) -> dict[str, dict[str, pd.DataFrame | np.ndarray]]:
    """Load one or more MAREoS benchmark datasets.

    One dataset is loaded for every combination of ``effects`` x
    ``effect_types`` x ``effect_examples``.

    Parameters
    ----------
    effects : list of str, str or None, optional (default None)
        Effects to load. If None, loads all ["eos", "true"].
    effect_types : list of str, str or None, optional (default None)
        Effect types to load. If None, loads all ["simple", "interaction"].
    effect_examples : list of str, str or None, optional (default None)
        Examples to load. If None, loads all ["1", "2"].
    as_numpy : bool, optional (default True)
        If True, return ``numpy.ndarray``, else ``pandas.DataFrame``.
    data_dir : Path | None, optional (default None)
        Directory containing MAREoS data files. If None, downloads to cache.
    force_download : bool, optional (default False)
        Force re-downloading the dataset in case of corrupted files.
    verbose : bool, optional (default False)
        Control verbosity.

    Returns
    -------
    dict of str and dict
        Nested dictionary keyed by dataset name (e.g. ``"eos_simple1"``),
        where each value contains:

        - "X": Feature matrix
        - "y": Target labels
        - "sites": Site labels
        - "covs": Covariates
        - "folds": Cross-validation folds

    Raises
    ------
    ValueError
        If any parameter contains invalid values.

    Examples
    --------
    >>> datasets = load_MAREoS()
    >>> len(datasets)
    8
    >>> datasets = load_MAREoS(effects=["eos"], effect_types=["simple"])
    >>> len(datasets)
    2
    >>> list(datasets.keys())
    ['eos_simple1', 'eos_simple2']

    """
    effects, effect_types, effect_examples = _validate_mareos_parameters(effects, effect_types, effect_examples)

    # Make sure every requested file is present locally (downloads if needed).
    data_dir = _ensure_mareos_data(data_dir, force_download, verbose)

    # Per-dataset keys, in the order _load_mareos_single_dataset returns them.
    keys = ("X", "y", "sites", "covs", "folds")

    # One entry per (effect, type, example) combination.
    return {
        f"{effect}_{e_type}{e_example}": dict(
            zip(
                keys,
                _load_mareos_single_dataset(
                    data_dir=data_dir,
                    effect=effect,
                    effect_type=e_type,
                    effect_example=e_example,
                    as_numpy=as_numpy,
                    verbose=verbose,
                ),
            )
        )
        for effect in effects
        for e_type in effect_types
        for e_example in effect_examples
    }

make_multisite_classification(n_sites=2, n_samples=1000, balance_per_site=None, n_features=10, signal_strength=1.0, noise_strength=1.0, site_effect_strength=3.0, site_effect_homogeneous=True, n_classes=2, random_state=42, verbose=False)

Simulate multi-site data with signal, noise, and site effect components.

The data generation follows: X = signal + noise + site_effect. All components are sampled from Gaussian distributions.

Parameters:

Name Type Description Default
n_sites int, optional (default 2)

Number of sites to simulate.

2
n_samples int, optional (default 1000)

Total number of samples across all sites.

1000
balance_per_site list of float or None, optional (default None)

Class balance for each site. If None, uses balanced classes (0.5 for binary, equal distribution for multi-class).

None
n_features int, optional (default 10)

Number of features per sample.

10
signal_strength float, optional (default 1.0)

Strength of the signal component separating classes.

1.0
noise_strength float, optional (default 1.0)

Strength of the noise component.

1.0
site_effect_strength float, optional (default 3.0)

Strength of site-specific effects.

3.0
site_effect_homogeneous bool, optional (default True)

Whether the site effect is homogeneous (same for all samples in a site).

True
n_classes int, optional (default 2)

Number of classes to simulate (2 for binary, >2 for multi-class).

2
random_state int or RandomState instance (default 42)

The seed of the pseudo random number generator or RandomState for reproducibility.

42
verbose bool, optional (default False)

Whether to print progress information.

False

Returns:

Name Type Description
X np.ndarray of shape (n_samples, n_features)

Simulated feature matrix

y np.ndarray of shape (n_samples,)

Class labels (0 to n_classes-1)

site_labels np.ndarray of shape (n_samples,)

Site labels (0 to n_sites-1)

Examples:

>>> X, y, site_labels = make_multisite_classification(
...     n_sites=3, n_samples=300, n_features=20, n_classes=3
... )
>>> X.shape, y.shape, site_labels.shape
((300, 20), (300,), (300,))
Source code in src/uniharmony/datasets/_make_multisite_classification.py
def make_multisite_classification(
    n_sites: int = 2,
    n_samples: int = 1000,
    balance_per_site: list[float] | None = None,
    n_features: int = 10,
    signal_strength: float = 1.0,
    noise_strength: float = 1.0,
    site_effect_strength: float = 3.0,
    site_effect_homogeneous: bool = True,
    n_classes: int = 2,
    random_state: int | np.random.RandomState = 42,
    verbose: bool = False,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Simulate multi-site data with signal, noise, and site effect components.

    The data generation follows: X = signal + noise + site_effect.
    All components are sampled from Gaussian distributions.

    Parameters
    ----------
    n_sites : int, optional (default 2)
        Number of sites to simulate.
    n_samples : int, optional (default 1000)
        Total number of samples across all sites.
    balance_per_site : list of float or None, optional (default None)
        Class balance for each site. If None, uses balanced classes (0.5 for
        binary, equal distribution for multi-class).
    n_features : int, optional (default 10)
        Number of features per sample.
    signal_strength : float, optional (default 1.0)
        Strength of the signal component separating classes.
    noise_strength : float, optional (default 1.0)
        Strength of the noise component.
    site_effect_strength : float, optional (default 3.0)
        Strength of site-specific effects.
    site_effect_homogeneous : bool, optional (default True)
        Whether the site effect is homogeneous (same for all samples in a
        site).
    n_classes : int, optional (default 2)
        Number of classes to simulate (2 for binary, >2 for multi-class).
    random_state : int or RandomState instance (default 42)
        The seed of the pseudo random number generator or RandomState for
        reproducibility.
    verbose : bool, optional (default False)
        Whether to print progress information.

    Returns
    -------
    X : np.ndarray of shape (n_samples, n_features)
        Simulated feature matrix
    y : np.ndarray of shape (n_samples,)
        Class labels (0 to n_classes-1)
    site_labels : np.ndarray of shape (n_samples,)
        Site labels (0 to ``n_sites``-1)

    Examples
    --------
    >>> X, y, site_labels = make_multisite_classification(
    ...     n_sites=3, n_samples=300, n_features=20, n_classes=3
    ... )
    >>> X.shape, y.shape, site_labels.shape
    ((300, 20), (300,), (300,))

    """
    random_state = check_random_state(random_state)

    # Validate input parameters
    _validate_parameters(
        n_sites=n_sites,
        n_samples=n_samples,
        n_features=n_features,
        signal_strength=signal_strength,
        noise_strength=noise_strength,
        site_effect_strength=site_effect_strength,
        n_classes=n_classes,
    )

    balance_per_site = _validate_balance_per_site(balance_per_site, n_sites, n_classes, verbose=verbose)

    # Allocate samples per site (even distribution; the first
    # n_samples % n_sites sites absorb the remainder).
    samples_per_site = np.full(n_sites, n_samples // n_sites, dtype=int)
    samples_per_site[: n_samples % n_sites] += 1

    # Accumulate per-site arrays; concatenated once after the loop.
    X_list = []
    y_list = []
    site_labels_list = []

    # Generate data for each site
    for site_idx in range(n_sites):
        n_site_samples = samples_per_site[site_idx]

        if verbose:
            logger.info(f"Generating {n_site_samples} samples for site {site_idx}")

        # Generate labels for this site
        if n_classes == 2:
            y_site = _generate_binary_labels(
                n_site_samples,
                balance_per_site[site_idx],
                random_state,
            )
        else:
            y_site = _generate_multiclass_labels(
                n_site_samples,
                balance_per_site[site_idx],
                n_classes,
                random_state,
            )

        # Generate signal component based on class labels
        signal = _generate_signal_component(
            y_site,
            n_features,
            signal_strength,
            n_classes,
            random_state,
        )

        # Generate noise component
        noise = random_state.normal(loc=0.0, scale=noise_strength, size=(n_site_samples, n_features))

        if site_effect_homogeneous:
            # Generate site effect (same for all samples in this site);
            # shape (1, n_features) broadcasts over the site's samples.
            site_effect = random_state.normal(
                loc=0.0,
                scale=site_effect_strength,
                size=(
                    1,
                    n_features,
                ),
            )
        else:
            # Generate site effect (different for each sample in this site)
            site_effect = random_state.normal(
                loc=0.0,
                scale=site_effect_strength,
                size=(n_site_samples, n_features),
            )

        # Combine components: X = signal + noise + site_effect
        X_site = signal + noise + site_effect

        X_list.append(X_site)
        y_list.append(y_site)
        site_labels_list.extend([site_idx] * n_site_samples)

    # Concatenate all sites
    X = np.vstack(X_list)
    y = np.concatenate(y_list)
    site_labels = np.array(site_labels_list, dtype=int)

    # Shuffle samples across sites (optional but recommended)
    indices = random_state.permutation(len(X))
    X = X[indices]
    y = y[indices]
    site_labels = site_labels[indices]

    if verbose:
        # Use the module logger (not print) for consistency with the
        # progress message emitted inside the per-site loop above.
        logger.info(f"Generated {len(X)} samples across {n_sites} sites")
        logger.info(f"Class distribution: {np.bincount(y)}")
        logger.info(f"Site distribution: {np.bincount(site_labels)}")

    return X, y, site_labels

verbosity(min_level='info')

Set verbosity level of logger.

Parameters:

Name Type Description Default
min_level (critical, error, warning, info, debug)

Minimum level to log.

"info"

Raises:

Type Description
ValueError

If min_level value is invalid.

Source code in src/uniharmony/_verbose.py
def verbosity(min_level="info") -> None:
    """Set verbosity level of logger.

    Parameters
    ----------
    min_level : {"critical", "error", "warning", "info", "debug"}
        Minimum level to log.

    Raises
    ------
    ValueError
        If `min_level` value is invalid.

    """
    # structlog raises KeyError for an unknown level name; translate it
    # into a ValueError listing the accepted values.
    try:
        wrapper_cls = structlog.make_filtering_bound_logger(min_level)
        structlog.configure(wrapper_class=wrapper_cls)
    except KeyError as err:
        valid_levels = ["critical", "error", "warning", "info", "debug"]
        raise ValueError(f"`min_level` needs to be one of: {valid_levels}") from err

verbosity_context(min_level='info')

Context manager to control the logger verbosity.

Parameters:

Name Type Description Default
min_level (critical, error, warning, info, debug)

Minimum level to log.

"info"

Yields:

Type Description
None
Source code in src/uniharmony/_verbose.py
@contextmanager
def verbosity_context(min_level="info") -> None:
    """Context manager to control the logger verbosity.

    Sets the logger's minimum level on entry and restores the previous
    wrapper class on exit, even if the body raises.

    Parameters
    ----------
    min_level : {"critical", "error", "warning", "info", "debug"}
        Minimum level to log.

    Yields
    ------
    None

    """
    # NOTE(review): as a generator-based context manager the annotation
    # would more accurately be Iterator[None] — confirm before changing.
    previous_wrapper = structlog.get_config()["wrapper_class"]
    verbosity(min_level)
    try:
        yield
    finally:
        # Restore whatever wrapper class was active before entering.
        structlog.configure(wrapper_class=previous_wrapper)