Source code for wingbeats.modelling.hypertuning

"""Library for hyperparameter optimization functions"""



# Import libraries
import numpy as np
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from tensorflow.keras.optimizers import Adam
#from uncertainties import unumpy

from wingbeats.modelling.builds import *
from wingbeats.modelling.metrics import embedding_similarity, embedding_loss
from wingbeats.modelling.metrics import predict_gen_spec

from wingbeats.processing.preprocessing import preprocess_dataset
from wingbeats.processing.postprocessing import compute_mean_conf_mat



def kfold_cv(X, y, models, genus_mapping, emb_matrix=None, n_splits=4, epochs=30, batch_size=64,
             sm=None, model_callbacks=None, sampling_rate=16000, window=None, nperseg=None,
             noverlap=None, cutoff=None):
    """Execute stratified K-fold cross-validation on the dataset **(X, y)**.

    The data is split into **n_splits** folds, stratified on the species column of **y**.
    Fold *k* is trained and evaluated with ``models[k-1]``, so at least **n_splits** models
    must be provided. The name of the first model must follow the pattern
    *architecture_inputFormat* (e.g. *HieraCls_spectro*); it is split at the first underscore
    and the input format must be one of *raw*, *psd* or *spectro*.
    If **sm** is not *None*, the training folds are resampled with ``sm.fit_resample``.

    :param X: Matrix of signals.
    :type X: list
    :param y: Label vector. Column 0 is compared against genus predictions,
        column 1 against species predictions.
    :type y: list
    :param models: Models to train, one per fold (only the first **n_splits** are used).
        Each needs a ``name`` attribute and a ``fit`` method.
    :type models: list
    :param genus_mapping: List containing the genus index of every species.
    :type genus_mapping: list
    :param emb_matrix: Matrix of hierarchical embeddings. Defaults to *None*.
        Only needed for *Embedder* models.
    :type emb_matrix: array
    :param n_splits: Number of folds. Defaults to 4.
    :type n_splits: int
    :param epochs: Number of epochs to train the model of each fold. Defaults to 30.
    :type epochs: int
    :param batch_size: Size of one signal batch in tf.Dataset. Defaults to 64.
    :type batch_size: int
    :param sm: SMOTE object to augment data. Defaults to *None*.
    :type sm: imblearn.smote, optional
    :param model_callbacks: Callbacks for model training (e.g. Early Stopping, Model Checkpoint,
        Learning Rate Schedules). Defaults to *None*.
    :type model_callbacks: list
    :param sampling_rate: Sampling frequency. Defaults to 16000.
    :type sampling_rate: int
    :param window: Window-function to multiply each segment with i.e. 'hann' (for psd) or
        ``tf.signal.hann_window`` (for spectrograms). Defaults to *None*.
    :type window: str (for psd) or function pointer (for spectrograms)
    :param nperseg: Length of a segment for applying the Welch-Transform or STFT.
        Defaults to *None*.
    :type nperseg: int
    :param noverlap: Length of overlapping region between segments. Defaults to *None*.
    :type noverlap: int
    :param cutoff: How many PSD frequencies should be kept. Defaults to *None*.
    :type cutoff: int, optional

    :return: Tuple ``(gen_mean_mat, gen_std_mat, spec_mean_mat, spec_std_mat, fold_histories)``:
        mean and std. dev. confusion matrices over all folds for genus and species, and the list
        of training histories for every fold. If the input format is unknown, a message is
        printed and *None* is returned instead.
    :raises ValueError: If fewer than **n_splits** models are provided.
    """

    # Induce model name and input format (split at the first underscore).
    # Done before anything else so that bad arguments are rejected cheaply.
    full_model_name = models[0].name
    model_name, _, input_format = full_model_name.partition('_')

    if input_format not in ('raw', 'psd', 'spectro'):
        print('Input format unknown!')
        return None

    # Fail fast: otherwise the lookup models[fold_ind - 1] would only blow up
    # after the earlier folds have already been trained.
    if len(models) < n_splits:
        raise ValueError(
            f'kfold_cv needs one model per fold: got {len(models)} models for {n_splits} folds.'
        )

    # Create object for splitting dataset into folds
    skf = StratifiedKFold(n_splits=n_splits)

    gen_accuracies, spec_accuracies = [], []   # list of accuracies for every fold
    gen_conf_mats, spec_conf_mats = [], []     # list of confusion matrices per fold
    fold_histories = []                        # list of histories for every fold

    labels = np.asarray(y)

    # Train and evaluate each fold (folds are stratified on the species column)
    for fold_ind, (train_index, cv_index) in enumerate(skf.split(X, labels[:, 1]), start=1):
        print(f'\nFOLD {fold_ind}')

        X_train_fold = [X[i] for i in train_index]
        X_cv_fold = [X[i] for i in cv_index]
        y_train_fold = labels[train_index]
        y_cv_fold = labels[cv_index]

        # Apply SMOTE (outputs will be Numpy in Colab and lists in Kaggle)
        if sm is not None:
            # Note: you can only input one-column y-vectors into sm;
            # the genus column is therefore rebuilt from genus_mapping afterwards
            X_train_fold, y_train_smote = sm.fit_resample(X_train_fold, y_train_fold[:, 1])
            y_train_fold = [[genus_mapping[spec], spec] for spec in y_train_smote]

        # Convert datasets into the right formats
        train_set = preprocess_dataset(X_train_fold, y_train_fold, model_name, input_format,
                                       sampling_rate, batch_size, window, nperseg, noverlap, cutoff,
                                       shuffle=True, cache=False)
        cv_set = preprocess_dataset(X_cv_fold, y_cv_fold, model_name, input_format,
                                    sampling_rate, batch_size, window, nperseg, noverlap, cutoff,
                                    shuffle=False, cache=False)

        # Model fitting
        model = models[fold_ind - 1]
        history = model.fit(train_set, epochs=epochs, validation_data=cv_set,
                            callbacks=model_callbacks, verbose=0)
        fold_histories.append(history.history)

        # Predictions + metrics evaluation
        pred_gens, pred_specs = predict_gen_spec(model, cv_set, model_name, genus_mapping, emb_matrix)

        # Accuracy = fraction of matching labels
        gen_accuracies.append(np.mean(np.asarray(pred_gens) == y_cv_fold[:, 0]))
        spec_accuracies.append(np.mean(np.asarray(pred_specs) == y_cv_fold[:, 1]))

        # Build the row-normalized confusion matrices, rounded to 2 decimals
        # For genus
        gen_conf_mats.append(
            np.round(confusion_matrix(y_cv_fold[:, 0], pred_gens, normalize='true'), 2)
        )
        # For species
        spec_conf_mats.append(
            np.round(confusion_matrix(y_cv_fold[:, 1], pred_specs, normalize='true'), 2)
        )

    print('*********************************************')
    mean_acc, std_acc = np.round(np.mean(gen_accuracies), 4), np.round(np.std(gen_accuracies), 4)
    print('MEAN GENUS VAL_ACC: ' + str(mean_acc) + ' +/- ' + str(std_acc))

    mean_acc, std_acc = np.round(np.mean(spec_accuracies), 4), np.round(np.std(spec_accuracies), 4)
    print('MEAN SPECIES VAL_ACC: ' + str(mean_acc) + ' +/- ' + str(std_acc))

    # Compute mean confusion matrices
    gen_mean_mat, gen_std_mat = compute_mean_conf_mat(gen_conf_mats)
    spec_mean_mat, spec_std_mat = compute_mean_conf_mat(spec_conf_mats)
    # Note: unumpy.uarray(mean_mat, std_mat) gives you the confusion matrix in the form mean +/- std

    return gen_mean_mat, gen_std_mat, spec_mean_mat, spec_std_mat, fold_histories
#############################################################################################
def build_hyper_simple_classifier(hp, in_shape, out_shape, f_extractor, lr_values, reg_values,
                                  taxonomic_levels=None, input_name="input_signal",
                                  model_name="Hyper_Simple_Classifier", strategy=None):
    """Build Simple Classifier for Hyperband-optimization.

    The learning rate and the regularization parameter are drawn from **hp** via
    ``hp.Choice``; the model is built and compiled inside ``strategy.scope()``.

    :param hp: Keras Tuner hyperparameter object (only its ``Choice`` method is used).
    :type hp: kerastuner.hyperband
    :param in_shape: Input shape. No need to specify batch dimension.
    :type in_shape: tuple
    :param out_shape: Model output shape (here equal to the size of embedded taxonomic level).
    :type out_shape: tuple
    :param f_extractor: Feature extractor.
    :type f_extractor: tf.Layer
    :param lr_values: Discrete learning rate values.
    :type lr_values: list
    :param reg_values: Discrete regularization parameter values.
    :type reg_values: list
    :param taxonomic_levels: Taxonomic levels to include in the loss function.
        Defaults to *None*, which means ['species'].
    :type taxonomic_levels: list, optional
    :param input_name: Name of the input. Defaults to 'input_signal'.
    :type input_name: str
    :param model_name: Name of the architecture. Defaults to 'Hyper_Simple_Classifier'.
    :type model_name: str
    :param strategy: Distribution strategy (CPU, GPU, TPU). Defaults to *None* (CPU).
    :type strategy: Strategy from tf.distribute, optional

    :return: Hyperband-optimizable Simple Classifier
    """

    # A fresh list per call: a list literal in the signature would be shared
    # between all calls and could be mutated downstream.
    if taxonomic_levels is None:
        taxonomic_levels = ['species']

    if strategy is None:
        strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")

    # Hyperparameters
    hp_lr = hp.Choice('learning_rate', values=lr_values)
    hp_reg = hp.Choice('reg_param', values=reg_values)

    with strategy.scope():
        model = build_simple_classifier(in_shape, out_shape, f_extractor, hp_reg,
                                        taxonomic_levels, input_name, model_name)
        model.compile(optimizer=Adam(hp_lr),
                      loss='sparse_categorical_crossentropy',
                      metrics=['accuracy'])

    return model
#############################################################################################
def build_hyper_embedder(hp, in_shape, out_shape, f_extractor, lr_values, reg_values, emb_matrix,
                         input_name="input_signal", model_name="Hyper_Embedder", strategy=None):
    """Build Embedder for Hyperband-optimization.

    Draws a learning rate and a regularization parameter from **hp**, then builds the
    Embedder and compiles it with the embedding loss and the embedding similarity metric
    derived from **emb_matrix**, all inside ``strategy.scope()``.

    :param hp: Keras Tuner hyperparameter object (only its ``Choice`` method is used).
    :type hp: kerastuner.hyperband
    :param in_shape: Input shape. No need to specify batch dimension.
    :type in_shape: tuple
    :param out_shape: Model output shape (here equal to the size of embedded taxonomic level).
    :type out_shape: tuple
    :param f_extractor: Feature extractor.
    :type f_extractor: tf.Layer
    :param lr_values: Discrete learning rate values.
    :type lr_values: list
    :param reg_values: Discrete regularization parameter values.
    :type reg_values: list
    :param emb_matrix: Matrix of hierarchical embeddings (required).
    :type emb_matrix: array
    :param input_name: Name of the input. Defaults to 'input_signal'.
    :type input_name: str
    :param model_name: Name of the architecture. Defaults to 'Hyper_Embedder'.
    :type model_name: str
    :param strategy: Distribution strategy (CPU, GPU, TPU). Defaults to *None* (CPU).
    :type strategy: Strategy from tf.distribute, optional

    :return: Hyperband-optimizable Embedder
    """

    # Fall back to a single-CPU strategy when the caller did not pick one
    active_strategy = (
        strategy if strategy is not None
        else tf.distribute.OneDeviceStrategy(device="/cpu:0")
    )

    # Tunable values: learning rate first, then regularization strength
    learning_rate = hp.Choice('learning_rate', values=lr_values)
    reg_param = hp.Choice('reg_param', values=reg_values)

    with active_strategy.scope():
        embedder = build_embedder(in_shape, out_shape, f_extractor, reg_param,
                                  input_name, model_name)

        compile_options = {
            'optimizer': Adam(learning_rate),
            'loss': embedding_loss(emb_matrix),
            'metrics': embedding_similarity(emb_matrix),
        }
        embedder.compile(**compile_options)

        return embedder
#############################################################################################
def build_hyper_simple_embedder_classifier(hp, in_shape, out_shape, f_extractor, lr_values, reg_values,
                                           emb_matrix, taxonomic_levels=None,
                                           input_name="input_signal",
                                           model_name="Hyper_Simple_Embedder_Classifier",
                                           strategy=None):
    """Build Simple Embedder Classifier for Hyperband-optimization.

    The learning rate and the regularization parameter are drawn from **hp** via
    ``hp.Choice``. The model is compiled with one loss and one metric per output
    ("embedding" and "species"), both losses weighted 1.0.

    :param hp: Keras Tuner hyperparameter object (only its ``Choice`` method is used).
    :type hp: kerastuner.hyperband
    :param in_shape: Input shape. No need to specify batch dimension.
    :type in_shape: tuple
    :param out_shape: Model output shape (here equal to the size of embedded taxonomic level).
    :type out_shape: tuple
    :param f_extractor: Feature extractor.
    :type f_extractor: tf.Layer
    :param lr_values: Discrete learning rate values.
    :type lr_values: list
    :param reg_values: Discrete regularization parameter values.
    :type reg_values: list
    :param emb_matrix: Matrix of hierarchical embeddings (required).
    :type emb_matrix: array
    :param taxonomic_levels: Taxonomic levels to include in the loss function.
        Defaults to *None*, which means ['species'].
    :type taxonomic_levels: list, optional
    :param input_name: Name of the input. Defaults to 'input_signal'.
    :type input_name: str
    :param model_name: Name of the architecture. Defaults to 'Hyper_Simple_Embedder_Classifier'.
    :type model_name: str
    :param strategy: Distribution strategy (CPU, GPU, TPU). Defaults to *None* (CPU).
    :type strategy: Strategy from tf.distribute, optional

    :return: Hyperband-optimizable Simple Embedder Classifier
    """

    # A fresh list per call: a list literal in the signature would be shared
    # between all calls and could be mutated downstream.
    if taxonomic_levels is None:
        taxonomic_levels = ['species']

    if strategy is None:
        strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")

    # Hyperparameters
    hp_lr = hp.Choice('learning_rate', values=lr_values)
    hp_reg = hp.Choice('reg_param', values=reg_values)

    with strategy.scope():
        model = build_simple_embedder_classifier(in_shape, out_shape, f_extractor, hp_reg,
                                                 taxonomic_levels, input_name, model_name)
        model.compile(
            optimizer=Adam(hp_lr),
            loss={
                "embedding": embedding_loss(emb_matrix),
                "species": 'sparse_categorical_crossentropy',
            },
            loss_weights={"embedding": 1.0, "species": 1.0},
            metrics={"embedding": embedding_similarity(emb_matrix), "species": 'accuracy'},
        )

    return model
#############################################################################################
def build_hyper_hiera_classifier(hp, in_shape, out_shape, f_extractor, lr_values, reg_values,
                                 taxonomic_levels=None, parallel=True, input_name="input_signal",
                                 model_name="Hyper_Hiera_Classifier", strategy=None):
    """Build Hierarchical Classifier for Hyperband-optimization.

    The learning rate and the regularization parameter are drawn from **hp** via
    ``hp.Choice``. The model is compiled with a sparse categorical cross-entropy loss and
    an accuracy metric for each of the "genus" and "species" outputs, both losses weighted 1.0.

    :param hp: Keras Tuner hyperparameter object (only its ``Choice`` method is used).
    :type hp: kerastuner.hyperband
    :param in_shape: Input shape. No need to specify batch dimension.
    :type in_shape: tuple
    :param out_shape: Model output shape (here equal to the size of embedded taxonomic level).
    :type out_shape: tuple
    :param f_extractor: Feature extractor.
    :type f_extractor: tf.Layer
    :param lr_values: Discrete learning rate values.
    :type lr_values: list
    :param reg_values: Discrete regularization parameter values.
    :type reg_values: list
    :param taxonomic_levels: Taxonomic levels to include in the loss function.
        Defaults to *None*, which means ['genus', 'species'].
    :type taxonomic_levels: list, optional
    :param parallel: Whether to attach parallel Dense layers for every prediction.
        Otherwise, they are connected one after another. Defaults to *True*.
    :type parallel: bool
    :param input_name: Name of the input. Defaults to 'input_signal'.
    :type input_name: str
    :param model_name: Name of the architecture. Defaults to 'Hyper_Hiera_Classifier'.
    :type model_name: str
    :param strategy: Distribution strategy (CPU, GPU, TPU). Defaults to *None* (CPU).
    :type strategy: Strategy from tf.distribute, optional

    :return: Hyperband-optimizable Hierarchical Classifier
    """

    # A fresh list per call: a list literal in the signature would be shared
    # between all calls and could be mutated downstream.
    if taxonomic_levels is None:
        taxonomic_levels = ['genus', 'species']

    if strategy is None:
        strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")

    # Hyperparameters
    hp_lr = hp.Choice('learning_rate', values=lr_values)
    hp_reg = hp.Choice('reg_param', values=reg_values)

    with strategy.scope():
        model = build_hiera_classifier(in_shape, out_shape, f_extractor, hp_reg,
                                       taxonomic_levels, parallel, input_name, model_name)
        model.compile(
            optimizer=Adam(hp_lr),
            loss={
                "genus": 'sparse_categorical_crossentropy',
                "species": 'sparse_categorical_crossentropy',
            },
            loss_weights={"genus": 1.0, "species": 1.0},
            metrics={"genus": 'accuracy', "species": 'accuracy'},
        )

    return model
#############################################################################################
def build_hyper_hiera_embedder_classifier(hp, in_shape, out_shape, f_extractor, lr_values, reg_values,
                                          emb_matrix, taxonomic_levels=None, parallel=True,
                                          input_name="input_signal",
                                          model_name="Hyper_Hiera_Embedder_Classifier",
                                          strategy=None):
    """Build Hierarchical Embedder-Classifier for Hyperband-optimization.

    The learning rate and the regularization parameter are drawn from **hp** via
    ``hp.Choice``. The model is compiled with one loss and one metric per output
    ("embedding", "genus" and "species"), all losses weighted 1.0.

    :param hp: Keras Tuner hyperparameter object (only its ``Choice`` method is used).
    :type hp: kerastuner.hyperband
    :param in_shape: Input shape. No need to specify batch dimension.
    :type in_shape: tuple
    :param out_shape: Model output shape (here equal to the size of embedded taxonomic level).
    :type out_shape: tuple
    :param f_extractor: Feature extractor.
    :type f_extractor: tf.Layer
    :param lr_values: Discrete learning rate values.
    :type lr_values: list
    :param reg_values: Discrete regularization parameter values.
    :type reg_values: list
    :param emb_matrix: Matrix of hierarchical embeddings (required).
    :type emb_matrix: array
    :param taxonomic_levels: Taxonomic levels to include in the loss function.
        Defaults to *None*, which means ['genus', 'species'].
    :type taxonomic_levels: list, optional
    :param parallel: Whether to attach parallel Dense layers for every prediction.
        Otherwise, they are connected one after another. Defaults to *True*.
    :type parallel: bool
    :param input_name: Name of the input. Defaults to 'input_signal'.
    :type input_name: str
    :param model_name: Name of the architecture. Defaults to 'Hyper_Hiera_Embedder_Classifier'.
    :type model_name: str
    :param strategy: Distribution strategy (CPU, GPU, TPU). Defaults to *None* (CPU).
    :type strategy: Strategy from tf.distribute, optional

    :return: Hyperband-optimizable Hierarchical Embedder-Classifier
    """

    # A fresh list per call: a list literal in the signature would be shared
    # between all calls and could be mutated downstream.
    if taxonomic_levels is None:
        taxonomic_levels = ['genus', 'species']

    if strategy is None:
        strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")

    # Hyperparameters
    hp_lr = hp.Choice('learning_rate', values=lr_values)
    hp_reg = hp.Choice('reg_param', values=reg_values)

    with strategy.scope():
        model = build_hiera_embedder_classifier(in_shape, out_shape, f_extractor, hp_reg,
                                                taxonomic_levels, parallel, input_name, model_name)
        model.compile(
            optimizer=Adam(hp_lr),
            loss={
                "embedding": embedding_loss(emb_matrix),
                "genus": 'sparse_categorical_crossentropy',
                "species": 'sparse_categorical_crossentropy',
            },
            loss_weights={"embedding": 1.0, "genus": 1.0, "species": 1.0},
            metrics={
                "embedding": embedding_similarity(emb_matrix),
                "genus": 'accuracy',
                "species": 'accuracy',
            },
        )

    return model