Data Augmentation

Our data has azimuthal symmetry: rotating an event about the z (beam) axis leaves the physics unchanged. We can therefore "create" copies of any event by rotating it around the beam axis by random angles in [0, 2π].

Bases: BaseEstimator, TransformerMixin

Parameters

target_size: (int) The number of points in the second dimension

Return

augmented_data: (np.array) The input array extended along the first axis with the rotated copies; every class (0–4) currently gets two copies per event

Source code in scripts/ml_preprocessing_steps.py

class DataAugumentation(BaseEstimator,TransformerMixin):
    """Augment events with randomly rotated copies about the z (beam) axis.

    Each event is copied a fixed number of times (set per class label inside
    ``transform``); in every copy the first two feature columns of the point
    rows are rotated by a random angle drawn uniformly from [0, 2*pi).

    Parameters
    ----------
    target_size: (int)
        The number of points in the second dimension. It is stored on the
        instance but not read by ``fit`` or ``transform``.

    Return
    ----------
    augmented_data: (np.array)
        The input array extended along the first axis with the rotated
        copies. Every class (0-4) currently gets two copies per event.

    """
    def __init__(self,target_size):
        self.target_size = target_size

    def fit(self,X,y=None):
        """Do nothing and return ``self``; this step has no state to learn."""
        return self

    def transform(self,X,y=None):
        """Adding more data by creating copies with azimuthal symmetry around z-axis

        The last two rows of every event are treated as metadata and copied
        unchanged; the class label is read from ``X[:, -2, 0]``. Columns 0 and
        1 of the remaining rows are rotated as the (x, y) coordinates.

        Args:
            X (np.array): data array, indexed as (event, row, feature)
            y (None): Defaults to None

        Returns:
            (np.array): augmented data array; the original events come first,
            followed by the rotated copies in event order

        Raises:
            KeyError: if an event carries a label outside 0-4
        """
        labels = X[:, -2, 0].astype(int)
        class_dist = np.array([np.sum(labels == i) for i in range(5)])  # there are 5 labels (0-4)
        print(f"Data shape before data augmentation: {X.shape}")
        print(f"The class distribution before augmentation: {class_dist}")

        # Number of rotated copies generated per event, keyed by class label.
        multipliers = {0: 2, 1: 2, 2: 2, 3: 2, 4: 2}
        n_original = len(X)
        n_copies = sum(class_dist[c] * m for c, m in multipliers.items())

        # Pre-allocate originals + copies; NaN marks slots not yet written.
        augmented_data = np.full((n_copies + n_original, X.shape[1], X.shape[2]), np.nan)
        augmented_data[:n_original] = X  # filling up the original events
        current_idx = n_original

        for event, label in zip(X, labels):
            # Read coordinates from the untouched input so that both rotated
            # components are computed from the ORIGINAL x and y. (Reading
            # them back from the array being written would feed the already
            # rotated x into the y formula.)
            x_orig = event[:-2, 0]
            y_orig = event[:-2, 1]

            for _ in range(multipliers[label]):
                theta = np.random.uniform(0, 2 * np.pi)  # rotation about the z-axis
                cos_t, sin_t = np.cos(theta), np.sin(theta)

                # Start from a verbatim copy (keeps the other columns and the
                # two metadata rows), then overwrite x and y.
                augmented_data[current_idx] = event
                augmented_data[current_idx, :-2, 0] = cos_t * x_orig - sin_t * y_orig
                augmented_data[current_idx, :-2, 1] = sin_t * x_orig + cos_t * y_orig
                current_idx += 1

        labels = augmented_data[:, -2, 0].astype(int)
        class_dist = np.array([np.sum(labels == i) for i in range(5)])  # there are 5 labels (0-4)
        print(f"The class distribution after augumentation: {class_dist}")

        return augmented_data

`transform(X, y=None)`

Adding more data by creating copies with azimuthal symmetry around z-axis

Parameters:

Name	Type	Description	Default
`X`	`array`	data array	required
`y`	`None`	Defaults to None	`None`

Returns:

Type	Description
`array`	augmented data array

Source code in scripts/ml_preprocessing_steps.py

def transform(self,X,y=None):
    """Adding more data by creating copies with azimuthal symmetry around z-axis

    The last two rows of every event are treated as metadata and copied
    unchanged; the class label is read from ``X[:, -2, 0]``. Columns 0 and 1
    of the remaining rows are rotated as the (x, y) coordinates by a random
    angle drawn uniformly from [0, 2*pi).

    Args:
        X (np.array): data array, indexed as (event, row, feature)
        y (None): Defaults to None

    Returns:
        (np.array): augmented data array; the original events come first,
        followed by the rotated copies in event order

    Raises:
        KeyError: if an event carries a label outside 0-4
    """
    labels = X[:, -2, 0].astype(int)
    class_dist = np.array([np.sum(labels == i) for i in range(5)])  # there are 5 labels (0-4)
    print(f"Data shape before data augmentation: {X.shape}")
    print(f"The class distribution before augmentation: {class_dist}")

    # Number of rotated copies generated per event, keyed by class label.
    multipliers = {0: 2, 1: 2, 2: 2, 3: 2, 4: 2}
    n_original = len(X)
    n_copies = sum(class_dist[c] * m for c, m in multipliers.items())

    # Pre-allocate originals + copies; NaN marks slots not yet written.
    augmented_data = np.full((n_copies + n_original, X.shape[1], X.shape[2]), np.nan)
    augmented_data[:n_original] = X  # filling up the original events
    current_idx = n_original

    for event, label in zip(X, labels):
        # Read coordinates from the untouched input so that both rotated
        # components are computed from the ORIGINAL x and y. (Reading them
        # back from the array being written would feed the already rotated x
        # into the y formula.)
        x_orig = event[:-2, 0]
        y_orig = event[:-2, 1]

        for _ in range(multipliers[label]):
            theta = np.random.uniform(0, 2 * np.pi)  # rotation about the z-axis
            cos_t, sin_t = np.cos(theta), np.sin(theta)

            # Start from a verbatim copy (keeps the other columns and the two
            # metadata rows), then overwrite x and y.
            augmented_data[current_idx] = event
            augmented_data[current_idx, :-2, 0] = cos_t * x_orig - sin_t * y_orig
            augmented_data[current_idx, :-2, 1] = sin_t * x_orig + cos_t * y_orig
            current_idx += 1

    labels = augmented_data[:, -2, 0].astype(int)
    class_dist = np.array([np.sum(labels == i) for i in range(5)])  # there are 5 labels (0-4)
    print(f"The class distribution after augumentation: {class_dist}")

    return augmented_data