Source code for wingbeats.modelling.metrics

"""Library for custom metrics and losses"""



# Import libraries
import tensorflow as tf
import numpy as np



[docs]def embedding_similarity(emb_matrix):
    """Compute embedding similarity of current batch.
    
    :param emb_matrix: Matrix of hierarchical embeddings.
    :type emb_matrix: array
    :return: Function that computes similarity between true and predicted embeddings.
    """
    
    def similarity(y_true, pred_emb):
        """Compute similarity between true and predicted embeddings.
        
        :param y_true: True class indices
        :type y_true: list or array
        :param pred_emb: Output of the embedder (after l2-normalization).
        :type pred_emb: list or array
        :return: Dot products between true and predicted embeddings.
        """

        b_size = tf.shape(y_true)[0] # batch size (may vary at the end of a training step)

        # Get equivalent embeddings of the true labels (vectorized version)
        true_emb = tf.gather(emb_matrix, tf.cast(y_true, tf.int32), axis=0)
        true_emb = tf.squeeze(true_emb)

        # Embedding similarity as sum of dot products divided by batch size
        # Note: All variables need to be float32 in order to support MixedPrecision on GPU
        dot_products = tf.cast( tf.reduce_sum(pred_emb*true_emb), dtype = tf.float32 ) / tf.cast(b_size, dtype=tf.float32)
        
        return dot_products
    return similarity

#############################################################################################

[docs]def embedding_loss(emb_matrix):
    """Compute embedding loss of current batch as *1.0 - embedding_similarity*.
        
    :param emb_matrix: Matrix of hierarchical embeddings.
    :type emb_matrix: array
    :return: Function that computes the embedding loss w.r.t. true and predicted embeddings.
    """
    
    def emb_loss(y_true, pred_emb):

        return tf.constant(1.0, dtype=tf.float32) - embedding_similarity(emb_matrix)(y_true, pred_emb)
    return emb_loss
    
#############################################################################################

[docs]def focal_loss(gamma = 2.0):
    """Compute focal loss as modified cross entropy loss. The goal is to penalize hard examples harsher.
    
    :param gamma: Penalty exponent. If **gamma** is 0.0, the focal loss becomes the standard cross entropy loss. Defaults to 2.0.
    :type gamma: float
    :return: Focal loss function w.r.t. true and predicted probabilities.
    """
    
    def loss(y_true, y_pred):

        b_size = tf.shape(y_true)[0] # batch size (may vary at the end of a training step)

        # Get probabilities of true predictions
        true_prob = tf.gather(y_pred, tf.cast(y_true, tf.int32), axis=-1, batch_dims=1)
        
        # Note: All variables need to be float32 in order to support MixedPrecision on GPU       
        return -(1-true_prob)**gamma * tf.math.log(true_prob) / tf.cast(b_size, dtype=tf.float32)
    return loss

#############################################################################################

[docs]def predict_gen_spec(model, X, model_name, genus_mapping, emb_matrix):
    """Make genus and species predictions on dataset **X** according to model architecture.
    
    :param model: Pretrained classifier.
    :type model: tf.Model
    :param X: Matrix of signals.
    :type X: tf.Dataset
    :param model_name: Name of the architecture. Currently only allowed: *SimpleCls*, *Emb*, *SimpleEmbCls*, \
    *HieraCls*, *HieraEmbCls*.
    :type model_name: str
    :param genus_mapping: List that maps the index of the species to the index of the genus. 
    :type genus_mapping: list
    :param emb_matrix: Matrix of hierarchical embeddings. Only needed for *Emb*.
    :type emb_matrix: array
    :return: Predicted genus and species
    :rtype: tuple
    """
    
    if model_name == 'SimpleCls':
        pred_specs = np.argmax(model.predict(X, verbose = 0), axis = -1)
        pred_gens  = genus_mapping[pred_specs]
        
    elif model_name == 'Emb':
        pred_embs = model.predict(X, verbose = 0)
        pred_specs = get_species_from_embeddings(pred_embs, genus_mapping, emb_matrix)
        pred_gens  = genus_mapping[pred_specs]
    
    elif model_name == 'SimpleEmbCls':
        _, pred_specs = model.predict(X, verbose = 0)
        pred_specs = np.argmax(pred_specs, axis = -1)
        pred_gens  = genus_mapping[pred_specs]
        
    elif model_name == 'HieraCls':
        pred_gens, pred_specs = model.predict(X, verbose = 0)
        pred_gens  = np.argmax(pred_gens,  axis = -1)
        pred_specs = np.argmax(pred_specs, axis = -1)
        
    elif model_name == 'HieraEmbCls':
        _, pred_gens, pred_specs = model.predict(X, verbose = 0)
        pred_gens  = np.argmax(pred_gens,  axis = -1)
        pred_specs = np.argmax(pred_specs, axis = -1)    
    
    return pred_gens, pred_specs

#############################################################################################

[docs]def get_species_from_embeddings(pred_embs, genus_mapping, emb_matrix):
    """Infer predicted species from predicted embeddings.
    
    :param pred_embs: Predicted embeddings to be compared via *nearest neighbor* to the true embeddings.
    :type pred_embs: array
    :param genus_mapping: List that maps the index of the species to the index of the genus. 
    :type genus_mapping: list
    :param emb_matrix: Matrix of hierarchical embeddings. Only needed for *Emb*.
    :type emb_matrix: array
    :return: Predicted species.
    :rtype: list
    """
        
    pred_specs = []
    for pred_emb in pred_embs:

        # Distances to all fixed embeddings
        pred_emb_distances = []
        for emb in emb_matrix:
            pred_emb_distances.append( tf.linalg.norm(pred_emb - emb) )

        # Predicted species index = index of the smallest distance(pred_emb, emb_list)
        pred_spec = pred_emb_distances.index(min(pred_emb_distances))
        pred_specs.append(pred_spec)
        
    return pred_specs