# Source code for src.superphot_plus.format_data_ztf

"""This script provides functions for importing, preprocessing, and
manipulating data related to ZTF lightcurves."""

import csv

import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold

from superphot_plus.file_paths import FITS_DIR
from superphot_plus.file_utils import get_multiple_posterior_samples, has_posterior_samples
from superphot_plus.supernova_class import SupernovaClass as SnClass


def import_labels_only(input_csvs, allowed_types, fits_dir=None, needs_posteriors=True, sampler=None):
    """Filters CSVs for rows where label is in allowed_types and returns
    names, labels and redshifts.

    Each CSV is expected to start with a header row (which is skipped),
    followed by rows whose first three columns are the lightcurve name,
    the label and the redshift (parsed as a float).

    Parameters
    ----------
    input_csvs : list of str
        List of input CSV file paths.
    allowed_types : list
        List of allowed (canonicalized) types for labels.
    fits_dir : str, optional
        Directory path for FITS files. Defaults to None, in which case
        FITS_DIR is used.
    needs_posteriors: boolean, optional
        If True, rows whose name has no posterior samples (as reported by
        has_posterior_samples) are skipped. Defaults to True.
    sampler : str, optional
        The sampler to get posteriors from.

    Returns
    -------
    tuple of np.ndarray
        Tuple of names, labels and redshifts. Labels are the
        canonicalized ones; only the first accepted row for each name
        is kept.

    Notes
    -----
    Maps groups of similar labels to a single representative label name
    (eg, "SN Ic", "SNIc-BL", and "21" all become "SN Ibc").

    As a side effect, prints the inputs, a tally of the original
    (non-canonicalized) labels that were kept, and the number of
    repeated names that were dropped.
    """
    if fits_dir is None:
        fits_dir = FITS_DIR

    names = []
    labels = []
    labels_orig = []
    redshifts = []
    # Set mirror of `names` so the duplicate check is O(1) per row.
    seen_names = set()
    repeat_ct = 0

    print(input_csvs, fits_dir, sampler)
    for input_csv in input_csvs:
        with open(input_csv, newline="", encoding="utf-8") as csvfile:
            csvreader = csv.reader(csvfile)
            # Skip the header; the default keeps an empty file from raising.
            next(csvreader, None)
            for row in csvreader:
                if not row:
                    # csv.reader yields [] for blank lines (e.g. a trailing newline).
                    continue
                name = row[0]
                if needs_posteriors and not has_posterior_samples(
                    lc_name=name, fits_dir=fits_dir, sampler=sampler
                ):
                    continue
                label_orig = row[1]
                row_label = SnClass.canonicalize(label_orig)

                if row_label not in allowed_types:
                    continue

                if name in seen_names:
                    repeat_ct += 1
                    continue

                seen_names.add(name)
                names.append(name)
                labels.append(row_label)
                labels_orig.append(label_orig)
                redshifts.append(float(row[2]))

    tally_each_class(labels_orig)
    print(repeat_ct)

    return np.array(names), np.array(labels), np.array(redshifts)


def generate_K_fold(features, classes, num_folds):
    """Generates set of K test sets and corresponding training sets.

    Parameters
    ----------
    features: list
        Input features.
    classes: list
        Input classes.
    num_folds : int
        Number of folds. If -1, sets num_folds=len(features), i.e.
        leave-one-out cross-validation.

    Returns
    -------
    generator
        Generator yielding the indices for training and test sets.

    Notes
    -----
    Folds are shuffled without a fixed seed, so splits differ between
    calls.
    """
    if num_folds == -1:
        # Leave-one-out: every test fold holds a single sample, so there is
        # nothing to stratify. StratifiedKFold rejects an n_splits larger
        # than every class size, which is always the case here once there
        # are two or more classes, so a plain KFold is used instead.
        from sklearn.model_selection import KFold

        kf = KFold(n_splits=len(features), shuffle=True)
    else:
        kf = StratifiedKFold(n_splits=num_folds, shuffle=True)
    return kf.split(features, classes)


def tally_each_class(labels):
    """Prints the number of samples with each class label.

    One "label: count" line is printed per distinct label, in order of
    first appearance, followed by a blank line.

    Parameters
    ----------
    labels: list
        Input labels.
    """
    from collections import Counter

    # Counter keeps insertion order, so labels print in first-seen order.
    for tally_label, count in Counter(labels).items():
        print(f"{tally_label}: {count}")
    print()


def oversample_using_posteriors(
    lc_names, labels, goal_per_class, fits_dir, sampler=None, redshifts=None, oversample_redshifts=False
):
    """Oversamples, drawing from posteriors of a certain fit.

    For each class, every lightcurve contributes the same number of
    posterior draws (with replacement), chosen so that the class total
    is roughly goal_per_class. Each lightcurve contributes at least one
    draw, so large classes may exceed the goal.

    Parameters
    ----------
    lc_names : list of str
        Lightcurve names, aligned with labels (and redshifts).
    labels : list
        List of labels.
    goal_per_class : int
        Number of samples per class.
    fits_dir : str
        Where fit parameters are stored.
    sampler : str, optional
        The name of the sampler to use.
    redshifts : list, optional
        List of redshift values. Required if oversample_redshifts is True.
    oversample_redshifts : boolean, optional
        Indicates whether to oversample redshifts.

    Returns
    -------
    tuple of np.ndarray
        Tuple containing oversampled features, labels, and redshifts.
        The redshift array is empty unless oversample_redshifts is True.

    Raises
    ------
    ValueError
        If oversample_redshifts is True but redshifts is None.
    """
    if oversample_redshifts and redshifts is None:
        # Fail early with a clear message instead of a TypeError mid-loop.
        raise ValueError("redshifts must be provided when oversample_redshifts is True.")

    oversampled_redshifts = []
    oversampled_labels = []
    oversampled_features = []

    labels = np.asarray(labels)
    labels_unique = np.unique(labels)

    posterior_samples = get_multiple_posterior_samples(lc_names, fits_dir, sampler)

    for label in labels_unique:
        idxs_in_class = np.flatnonzero(labels == label)
        num_in_class = len(idxs_in_class)
        # Same number of draws for every fit in the class; never fewer than one.
        samples_per_fit = max(1, int(np.round(goal_per_class / num_in_class)))
        for i in idxs_in_class:
            # assumes each entry supports integer-array indexing (e.g. an ndarray
            # of posterior draws) -- TODO confirm against get_multiple_posterior_samples
            all_posts = posterior_samples[lc_names[i]]
            sampled_idx = np.random.choice(np.arange(len(all_posts)), samples_per_fit)
            oversampled_features.extend(all_posts[sampled_idx])
            oversampled_labels.extend([label] * samples_per_fit)
            if oversample_redshifts:
                oversampled_redshifts.extend([redshifts[i]] * samples_per_fit)

    return np.array(oversampled_features), np.array(oversampled_labels), np.array(oversampled_redshifts)


def normalize_features(features, mean=None, std=None):
    """Normalizes the features for feeding into the neural network.

    Parameters
    ----------
    features : array-like
        Input features. Must be a 2-d array where each row corresponds
        to a data point and each entry to a feature.
    mean : array-like, optional
        Mean values for normalization. Defaults to None, in which case
        the per-feature mean of `features` is used.
    std : array-like, optional
        Standard deviation values for normalization. Defaults to None,
        in which case the per-feature standard deviation of `features`
        is used.

    Returns
    -------
    tuple of np.ndarray
        Tuple containing normalized features, mean values, and standard
        deviation values. The returned mean and std are the ones used
        (computed here or passed in); zero entries in std are returned
        unchanged.
    """
    features = np.asarray(features)

    if mean is None:
        mean = features.mean(axis=-2)
    if std is None:
        std = features.std(axis=-2)

    # Coerce before comparing: on a plain list `std == 0.0` is a single
    # False, which would leave zero entries in place and divide by zero.
    std_arr = np.asarray(std)
    # Constant features (std == 0) are divided by 1 so they map to 0, not NaN/inf.
    # np.where builds a new array, so the caller's std is never modified.
    safe_std = np.where(std_arr == 0.0, np.ones_like(std_arr), std_arr)

    return (features - np.asarray(mean)) / safe_std, mean, std


def oversample_smote(features, labels):
    """Uses SMOTE to oversample data from rarer classes.

    Parameters
    ----------
    features : array-like
        Input features, one row per sample.
    labels : array-like
        Class label of each sample.

    Returns
    -------
    tuple
        Tuple containing the resampled features and labels, as returned
        by SMOTE.fit_resample with default SMOTE settings.
    """
    smote = SMOTE()
    return smote.fit_resample(features, labels)