Skip to content

Data Limitation

Data Limitation

We are hoping to train a classification model, hence it is important to balance the classes in order to avoid bias. We can do so simply by ensuring that each class has the same number of training examples—by limiting every class to the number of events in the smallest class.

Bases: BaseEstimator, TransformerMixin

Parameters

target_size: (int) The output array is allocated with a second axis of length target_size + 2

Return

limited_data: (np.array) All classes are now balanced through limitation

Source code in scripts/ml_preprocessing_steps.py
class DataLimitation(BaseEstimator, TransformerMixin):
    """
    Balance the classes by truncating every class to the size of the smallest one.

    Parameters
    ----------
    target_size: (int)
        The output array is allocated with shape
        ``(n_rows, target_size + 2, 4)``, so each event in ``X`` must fit
        (broadcast to) ``(target_size + 2, 4)``.

    Return
    ----------
    limited_data: (np.array)
        All classes are now balanced through limitation

    """

    def __init__(self, target_size):
        self.target_size = target_size

    def fit(self, X, y=None):
        """Nothing is learned from the data; the transformer is returned unchanged."""
        return self

    def transform(self, X, y=None):
        """Balancing the classes by limiting to the lowest class to remove any training bias

        The label of each event is read from ``X[:, -2, 0]``. Only labels
        0..4 are kept. For every class that is present, the first
        ``lowest_events`` events (in their original order) are copied to the
        output, where ``lowest_events`` is the size of the smallest present
        class.

        Args:
            X (np.array): data array with reclassified labels
            y (None): Defaults to None. Unused.

        Returns:
            (np.array): limited data with balanced classes, as a float array
            of shape ``(n_present_classes * lowest_events, target_size + 2, 4)``.
            It has zero rows when ``X`` holds no event with a valid label.
        """
        n_classes = 5  # labels 0..4 are valid classes, everything else is dropped

        labels = X[:, -2, 0].astype(int)
        # Negative values are rejected too: they are not class ids and would
        # otherwise be counted as classes of their own.
        valid_mask = (labels >= 0) & (labels < n_classes)
        X_valid = X[valid_mask]
        labels_valid = labels[valid_mask]

        unique_labels, counts_labels = np.unique(labels_valid, return_counts=True)

        print(f"Current class distribution is {counts_labels}")
        # No valid event at all -> zero rows per class instead of min() failing.
        lowest_events = int(counts_labels.min()) if counts_labels.size else 0

        # Size the output by the classes that are actually present; reserving
        # rows for absent classes would leave NaN-only rows at the end.
        n_rows = len(unique_labels) * lowest_events
        limiting_data = np.full((n_rows, self.target_size + 2, 4), np.nan)
        print(f"The new shape after limiting to lowest class{limiting_data.shape}")

        # Mark the first `lowest_events` occurrences of every class. A boolean
        # mask keeps the selected events in their original relative order.
        keep = np.zeros(len(labels_valid), dtype=bool)
        for label in unique_labels:
            keep[np.flatnonzero(labels_valid == label)[:lowest_events]] = True
        limiting_data[:] = X_valid[keep]

        labels_ll = limiting_data[:, -2, 0].astype(int)
        _, counts_labels_ll = np.unique(labels_ll, return_counts=True)
        print(f"Updated class distribution is {counts_labels_ll}")

        return limiting_data

transform(X, y=None)

Balancing the classes by limiting to the lowest class to remove any training bias

Parameters:

Name Type Description Default
X array

data array with reclassified labels

required
y None

Defaults to None.

None

Returns:

Type Description
array

limited data with balanced classes

Source code in scripts/ml_preprocessing_steps.py
def transform(self, X, y=None):
    """Balancing the classes by limiting to the lowest class to remove any training bias

    The label of each event is read from ``X[:, -2, 0]``. Only labels 0..4
    are kept. For every class that is present, the first ``lowest_events``
    events (in their original order) are copied to the output, where
    ``lowest_events`` is the size of the smallest present class.

    Args:
        X (np.array): data array with reclassified labels
        y (None): Defaults to None. Unused.

    Returns:
        (np.array): limited data with balanced classes, as a float array of
        shape ``(n_present_classes * lowest_events, target_size + 2, 4)``.
        It has zero rows when ``X`` holds no event with a valid label.
    """
    n_classes = 5  # labels 0..4 are valid classes, everything else is dropped

    labels = X[:, -2, 0].astype(int)
    # Negative values are rejected too: they are not class ids and would
    # otherwise be counted as classes of their own.
    valid_mask = (labels >= 0) & (labels < n_classes)
    X_valid = X[valid_mask]
    labels_valid = labels[valid_mask]

    unique_labels, counts_labels = np.unique(labels_valid, return_counts=True)

    print(f"Current class distribution is {counts_labels}")
    # No valid event at all -> zero rows per class instead of min() failing.
    lowest_events = int(counts_labels.min()) if counts_labels.size else 0

    # Size the output by the classes that are actually present; reserving
    # rows for absent classes would leave NaN-only rows at the end.
    n_rows = len(unique_labels) * lowest_events
    limiting_data = np.full((n_rows, self.target_size + 2, 4), np.nan)
    print(f"The new shape after limiting to lowest class{limiting_data.shape}")

    # Mark the first `lowest_events` occurrences of every class. A boolean
    # mask keeps the selected events in their original relative order.
    keep = np.zeros(len(labels_valid), dtype=bool)
    for label in unique_labels:
        keep[np.flatnonzero(labels_valid == label)[:lowest_events]] = True
    limiting_data[:] = X_valid[keep]

    labels_ll = limiting_data[:, -2, 0].astype(int)
    _, counts_labels_ll = np.unique(labels_ll, return_counts=True)
    print(f"Updated class distribution is {counts_labels_ll}")

    return limiting_data