import numpy as np
import apricot
import math
from .nonadaptivedataloader import NonAdaptiveDSSDataLoader
import torch
import time
class SubmodDataLoader(NonAdaptiveDSSDataLoader):
    # The dataset is split into chunks of at most |size_chunk| samples, and
    # samples are then selected proportionally from every chunk; otherwise the
    # pairwise distance matrix would be too large to fit in memory.
"""
    Implementation of the SubmodDataLoader class for the nonadaptive submodular subset selection strategies in the supervised learning setting.
Parameters
-----------
train_loader: torch.utils.data.DataLoader class
Dataloader of the training dataset
val_loader: torch.utils.data.DataLoader class
Dataloader of the validation dataset
dss_args: dict
Data subset selection arguments dictionary
logger: class
Logger for logging the information
"""
def __init__(self, train_loader, val_loader, dss_args, logger, *args,
**kwargs):
"""
Constructor function
"""
# Arguments assertion
assert "size_chunk" in dss_args.keys(), "'size_chunk' is a compulsory agument for submodular dataloader"
self.size_chunk = dss_args.size_chunk
self.dss_args = dss_args
super(SubmodDataLoader, self).__init__(train_loader, val_loader, dss_args,
logger, *args, **kwargs)
self.logger.info("You are using max_chunk: %s", dss_args.size_chunk)
def _init_subset_indices(self):
"""
Initializes the subset indices and weights by calling the respective submodular function for data subset selection.
"""
start_time = time.time()
        # Build a flattened feature matrix X for the entire training set,
        # one batch at a time.
        features = []
        for x, y in self.train_loader:
            if self.dss_args.data_type == 'text':
                # For text data, embed the tokens with the model's embedding
                # layer and mean-pool over the sequence dimension.
                with torch.no_grad():
                    X_b = self.dss_args.model.embedding(x.to(self.device))
                    X_b = X_b.mean(dim=1)
            else:
                # For other modalities, the raw inputs serve as features.
                X_b = x
            features.append(X_b.reshape(X_b.shape[0], -1))
        X = torch.cat(features, dim=0)
        m = X.shape[0]
        X = X.to(device='cpu').numpy()
# Chunking dataset to calculate pairwise distance with limited memory
sample_indices = []
size_chunk, budget = self.size_chunk, self.budget
n_chunks = math.ceil(m / self.size_chunk)
budget_chunk = math.ceil(budget / n_chunks)
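        # For example, with m = 10000, size_chunk = 2000 and budget = 1000:
        # n_chunks = ceil(10000 / 2000) = 5 and budget_chunk = ceil(1000 / 5) = 200,
        # so roughly 200 samples are drawn from each 2000-sample chunk.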
for i_chunk in range(n_chunks):
l_idx = i_chunk * size_chunk
r_idx = min(m, (i_chunk + 1) * size_chunk)
n_samples = min(budget_chunk, budget - len(sample_indices))
chunk = X[l_idx: r_idx, :]
            chunk_indices = self._chunk_select(chunk, n_samples)
            # Offset chunk-local indices to positions in the full dataset.
            sample_indices += [idx + l_idx for idx in chunk_indices]
time_taken = time.time() - start_time
self.logger.info("Submodular subset selection time is %.4f", time_taken)
return np.array(sample_indices)
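
    # Every concrete subclass below supplies the actual greedy selection; this
    # stub is a small added sketch that makes the contract explicit.
    def _chunk_select(self, chunk, n_samples):
        raise NotImplementedError(
            "Subclasses of SubmodDataLoader must implement _chunk_select")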
# Submodular-optimization-based dataloaders
class FacLocDataLoader(SubmodDataLoader):
"""
    Implementation of the FacLocDataLoader class for the nonadaptive facility location
    based subset selection strategy in the supervised learning setting.
Parameters
-----------
train_loader: torch.utils.data.DataLoader class
Dataloader of the training dataset
val_loader: torch.utils.data.DataLoader class
Dataloader of the validation dataset
dss_args: dict
Data subset selection arguments dictionary
logger: class
Logger for logging the information
"""
def __init__(self, train_loader, val_loader, dss_args, logger, *args,
**kwargs):
super(FacLocDataLoader, self).__init__(train_loader, val_loader, dss_args,
logger, *args, **kwargs)
def _chunk_select(self, chunk, n_samples):
"""
Function that selects the data samples by calling the facility location function.
Parameters
-----------
chunk: numpy array
Chunk of the input data from which the subset needs to be selected
n_samples: int
            Number of samples that need to be selected from the input chunk
Returns
--------
ranking: list
Ranking of the samples based on the facility location gain
"""
f = apricot.functions.facilityLocation.FacilityLocationSelection(n_samples=n_samples)
m = f.fit(chunk)
return list(m.ranking)
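
# A hedged usage sketch for the loaders above: apart from 'size_chunk' and
# 'data_type', which the code reads directly, the field names and values below
# follow common CORDS conventions and are assumptions here.
#
#   from dotmap import DotMap
#   dss_args = DotMap({'type': 'FacLoc', 'fraction': 0.1, 'size_chunk': 2000,
#                      'data_type': 'image', 'model': model, 'device': 'cuda'})
#   subset_loader = FacLocDataLoader(train_loader, val_loader, dss_args,
#                                    logger, batch_size=128, shuffle=True)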
class GraphCutDataLoader(SubmodDataLoader):
"""
    Implementation of the GraphCutDataLoader class for the nonadaptive graph cut function
    based subset selection strategy in the supervised learning setting.
Parameters
-----------
train_loader: torch.utils.data.DataLoader class
Dataloader of the training dataset
val_loader: torch.utils.data.DataLoader class
Dataloader of the validation dataset
dss_args: dict
Data subset selection arguments dictionary
logger: class
Logger for logging the information
"""
def __init__(self, train_loader, val_loader, dss_args, logger, *args,
**kwargs):
super(GraphCutDataLoader, self).__init__(train_loader, val_loader, dss_args,
logger, *args, **kwargs)
def _chunk_select(self, chunk, n_samples):
"""
        Function that selects the data samples by calling the graph cut function.
Parameters
-----------
chunk: numpy array
Chunk of the input data from which the subset needs to be selected
n_samples: int
            Number of samples that need to be selected from the input chunk
Returns
--------
ranking: list
            Ranking of the samples based on the graph cut gain
"""
f = apricot.functions.graphCut.GraphCutSelection(n_samples=n_samples)
m = f.fit(chunk)
return list(m.ranking)
class SumRedundancyDataLoader(SubmodDataLoader):
"""
    Implementation of the SumRedundancyDataLoader class for the nonadaptive sum redundancy function
    based subset selection strategy in the supervised learning setting.
Parameters
-----------
train_loader: torch.utils.data.DataLoader class
Dataloader of the training dataset
val_loader: torch.utils.data.DataLoader class
Dataloader of the validation dataset
dss_args: dict
Data subset selection arguments dictionary
logger: class
Logger for logging the information
"""
def __init__(self, train_loader, val_loader, dss_args, logger, *args,
**kwargs):
super(SumRedundancyDataLoader, self).__init__(train_loader, val_loader, dss_args,
logger, *args, **kwargs)
def _chunk_select(self, chunk, n_samples):
"""
Function that selects the data samples by calling the sum redundancy function.
Parameters
-----------
chunk: numpy array
Chunk of the input data from which the subset needs to be selected
n_samples: int
            Number of samples that need to be selected from the input chunk
Returns
--------
ranking: list
Ranking of the samples based on the sum redundancy gain
"""
f = apricot.functions.sumRedundancy.SumRedundancySelection(n_samples=n_samples)
m = f.fit(chunk)
return list(m.ranking)
class SaturatedCoverageDataLoader(SubmodDataLoader):
"""
    Implementation of the SaturatedCoverageDataLoader class for the nonadaptive saturated coverage
    function based subset selection strategy in the supervised learning setting.
Parameters
-----------
train_loader: torch.utils.data.DataLoader class
Dataloader of the training dataset
val_loader: torch.utils.data.DataLoader class
Dataloader of the validation dataset
dss_args: dict
Data subset selection arguments dictionary
logger: class
Logger for logging the information
"""
def __init__(self, train_loader, val_loader, dss_args, logger, *args,
**kwargs):
super(SaturatedCoverageDataLoader, self).__init__(train_loader, val_loader, dss_args,
logger, *args, **kwargs)
def _chunk_select(self, chunk, n_samples):
"""
Function that selects the data samples by calling the saturated coverage function.
Parameters
-----------
chunk: numpy array
Chunk of the input data from which the subset needs to be selected
n_samples: int
            Number of samples that need to be selected from the input chunk
Returns
--------
ranking: list
Ranking of the samples based on the saturated coverage gain
"""
        f = apricot.functions.saturatedCoverage.SaturatedCoverageSelection(n_samples=n_samples)
m = f.fit(chunk)
return list(m.ranking)
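
# A minimal, self-contained sketch of the chunked greedy selection used by
# SubmodDataLoader, run on a random feature matrix; the sizes below are
# illustrative assumptions, not CORDS defaults. (Because this module uses a
# relative import, run it as part of its package, e.g. with `python -m ...`.)
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    X_demo = rng.normal(size=(1000, 16))
    size_chunk, budget = 250, 40
    n_chunks = math.ceil(X_demo.shape[0] / size_chunk)
    budget_chunk = math.ceil(budget / n_chunks)
    picked = []
    for i_chunk in range(n_chunks):
        l_idx = i_chunk * size_chunk
        r_idx = min(X_demo.shape[0], (i_chunk + 1) * size_chunk)
        n_samples = min(budget_chunk, budget - len(picked))
        selector = apricot.functions.facilityLocation.FacilityLocationSelection(
            n_samples=n_samples)
        selector.fit(X_demo[l_idx:r_idx, :])
        # Offset chunk-local picks back to dataset positions.
        picked += [int(i) + l_idx for i in selector.ranking]
    print("Selected %d of %d samples, e.g. %s" % (len(picked), X_demo.shape[0], picked[:5]))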