Skip to content

Up-Down Scaling

Resampling

Each event contains a different number of points, so the data tensor is padded with zeros where the point clouds are shorter. We require a static tensor with no padding; the best way to achieve this is to choose a target value—upscaling any events with fewer points and downscaling any with more.

Bases: BaseEstimator, TransformerMixin

Parameters

target_size: (int) The number of points to up/down sample to

Returns

new_data: (array) Up/down sampled data with shape (run_events, target_size + 2, 4)

Source code in scripts/ml_preprocessing_steps.py
class UpDownScaling(BaseEstimator, TransformerMixin):
    """Resample every event's point cloud to a fixed number of points.

    Events with more points than ``target_size`` are randomly subsampled
    (downscaled); events with fewer points have randomly chosen points
    repeated (upscaled). Each output event therefore holds exactly
    ``target_size`` points plus two trailing rows carrying the label and
    the event index.

    Parameters
    ----------
    target_size: (int)
        The number of points to up/down sample to
    isotope: (str)
        Isotope identifier (stored for bookkeeping; not used in transform)
    dimension: (int)
        Number of features per point. Defaults to 4.

    Returns
    ----------
    new_data: (array)
        Up/down sampled data with shape (run_events, target_size + 2, dimension)
    """
    def __init__(self, target_size: int, isotope: str, dimension: int = 4):
        self.target_size = target_size
        self.pcloud_zeros = 0  # counts events that contain zero points
        self.dimension = dimension
        self.isotope = isotope

    def fit(self, X, y=None):
        # Stateless transformer: there is nothing to learn from the data.
        return self

    def transform(self, X, y=None):
        """Resample point clouds to ``target_size`` points for a static array.

        Args:
            X (tuple): (data, event_lengths) np.arrays. X must be a single
                argument to preserve the custom-transformer convention.
                Assumes the last two rows of each event in ``data`` hold the
                label and the event index (copied through unchanged).
            y (None): Unused. Defaults to None.

        Returns:
            (np.array): resampled data with shape
                (run_events, target_size + 2, dimension)
        """
        data, event_lengths = X
        len_run = len(data)
        new_data = np.full((len_run, self.target_size + 2, self.dimension), np.nan)

        for i in tqdm.tqdm(range(len_run), desc="Resampling data"):
            ev_len = event_lengths[i]  # number of real (non-padding) points in this event
            if ev_len == 0:  # empty events cannot be resampled; count and skip
                print(f"This event has 0 length: {i}")
                self.pcloud_zeros += 1
                continue
            if ev_len > self.target_size:
                # Downsample: keep a random subset of target_size distinct points.
                keep = np.random.choice(ev_len, self.target_size, replace=False)
                new_data[i, :self.target_size] = data[i, keep]
            else:
                # Upsample: start with every real point, then repeat randomly
                # chosen points until target_size is reached.
                new_data[i, :ev_len, :] = data[i, :ev_len, :]
                need = self.target_size - ev_len
                # Drawing with replacement is only required when more repeats
                # are needed than there are points to draw from.
                extra = np.random.choice(ev_len, need, replace=need > ev_len)
                new_data[i, ev_len:self.target_size] = data[i, extra]
                if np.isnan(new_data[i, ev_len:self.target_size, 0]).any():
                    print(f"NaN found while upsampling event {i}")  # padding must never leak through
            new_data[i, -2] = data[i, -2]  # saving the label
            new_data[i, -1] = data[i, -1]  # saving the event index

        # Invariant checks. NOTE(review): `assert` is stripped under `python -O`;
        # kept as asserts so callers catching AssertionError are unaffected.
        assert self.pcloud_zeros == 0, "There are events with no points"
        assert new_data.shape == (len_run, self.target_size + 2, self.dimension), 'Array has incorrect shape'
        assert len(np.unique(new_data[:, -1, 0])) + self.pcloud_zeros == len_run, 'Array has incorrect number of events'
        assert not np.isnan(new_data).any(), "NaNs detected in new_data"  # very important: no NaNs may remain
        print(f"Transformed shape of data: {new_data.shape}")
        return new_data

transform(X, y=None)

Resampling point clouds to a target value for a static array

Parameters:

Name Type Description Default
X tuple

data and event lengths np.array

required
y None

Defaults to None.

None

Returns:

Type Description
array

new data with modified shape

Source code in scripts/ml_preprocessing_steps.py
def transform(self,X,y=None): #for up/down scaling
    """Resample point clouds to a fixed target size for a static array.

    Events with more points than ``self.target_size`` are randomly
    subsampled (downscaled); events with fewer points have randomly chosen
    points repeated (upscaled). The last two rows of each event — assumed
    to hold the label and the event index (see the trailing assignments;
    confirm against the caller) — are copied through unchanged.

    Args:
        X (tuple): (data, event_lengths) np.arrays; X must be a single
            argument to preserve the conventions of a custom transformer.
        y (None): Unused. Defaults to None.

    Returns:
        (np.array): resampled data with shape
            (run_events, target_size + 2, dimension)
    """
    data,event_lengths = X # unpack the (data, event_lengths) pair
    len_run = len(data)
    new_data = np.full((len_run, self.target_size+2, self.dimension), np.nan) 

    for i in tqdm.tqdm(range(len_run), desc="Resampling data"): # progress bar over events
        ev_len = event_lengths[i] # number of real (non-padding) points in this event
        if ev_len == 0: # empty events cannot be resampled; count and skip
            print(f"This event has 0 length: {i}")
            self.pcloud_zeros+=1
            continue
        if ev_len > self.target_size: # downsample: keep a random subset of points
            random_points = np.random.choice(ev_len, self.target_size, replace=False)  # distinct indices to keep
            for r in range(len(random_points)):  # copy only the chosen target_size points
                new_data[i,r] = data[i,random_points[r]]

        else:
            new_data[i,:ev_len,:] = data[i,:ev_len,:] # upsample: start with every real point
            need = self.target_size - ev_len
            random_points = np.random.choice(ev_len, need, replace= True if need > ev_len else False) # only repeats points when more points are needed than the event holds
            count = ev_len
            for r in random_points:
                new_data[i,count] = data[i,r]
                if np.isnan(new_data[i, count, 0]):
                    print(f"NaN found at event {i}, index {count}") # padding NaNs must never be copied in
                count += 1
        new_data[i,-2] = data[i,-2] # saving the label
        new_data[i,-1] = data[i,-1] # saving the event index


    # Invariant checks. NOTE(review): `assert` is stripped under `python -O`.
    assert self.pcloud_zeros == 0, "There are events with no points"
    assert new_data.shape == (len_run, self.target_size+2, self.dimension), 'Array has incorrect shape'
    assert len(np.unique(new_data[:,-1,0]))+self.pcloud_zeros == len_run, 'Array has incorrect number of events'
    assert not np.isnan(new_data).any(), "NaNs detected in new_data" # very important: no NaNs may remain
    print(f"Transformed shape of data: {new_data.shape}")
    return new_data