StandardizationScaler
ADLStream.data.preprocessing.StandardizationScaler
The transformation is given by x_scaled = (x - avg_x) / x_stdev
where avg_x is the mean seen until now for the feature x and x_stdev is the standard deviation seen until now for the feature x.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
share_params |
bool |
Whether to share scaler parameters among columns. Defaults to False. |
False |
Source code in ADLStream/data/preprocessing/standardization_scaler.py
class StandardizationScaler(BasePreprocessor):
    """Online standardization (z-score) scaler.

    The transformation is given by
        x_scaled = (x - avg_x) / x_stdev
    where avg_x is the mean seen until now for the feature x and x_stdev
    is the standard deviation seen until now for the feature x.

    The running standard deviation is maintained with Welford's online
    algorithm, which is numerically stable for streaming data.

    Arguments:
        share_params (bool): Whether to share scaler parameters among columns.
            Defaults to False.
    """

    def __init__(self, share_params=False):
        self.share_params = share_params
        self.data = None
        # Running sum of observed values: a list (one entry per column),
        # or a single float when `share_params` is True.
        self.data_sum = None
        # Welford's M2 accumulator (running sum of squared deviations).
        self.data_stdev_sum = None
        self.data_avg = None
        self.data_stdev = None
        self.data_count = 1

    def _mean(self, a):
        """Update the running sum with `a` and return the new mean.

        Arguments:
            a (list): input data from stream generator.

        Returns:
            list: updated running mean, one value per column.
        """
        if not self.share_params:
            assert len(a) == len(self.data_sum)
            self.data_sum = [s + v for s, v in zip(self.data_sum, a)]
            mean = [s / self.data_count for s in self.data_sum]
        else:
            # One scalar parameter set shared by every column: all values
            # contribute to a single running sum.
            self.data_sum += sum(a)
            mean = [self.data_sum / (self.data_count * len(a))] * len(a)
        return mean

    def _standard_deviation(self, a):
        """Update and return the running standard deviation.

        Uses Welford's online algorithm:
            data_stdev_sum = old_data_stdev_sum + delta * delta2
            stdev = sqrt(data_stdev_sum / (n - 1))
        where `delta` is computed against the mean *before* including `a`
        and `delta2` against the mean *after* including it.

        Arguments:
            a (list): input data from stream generator.

        Returns:
            list: standard deviation of the data, one value per column.
        """
        mean = self.data_avg
        if not self.share_params:
            assert len(a) == len(self.data_sum)
            # Deviation from the previous mean (before seeing `a`).
            delta = [v - m for v, m in zip(a, mean)]
            # Mean after including `a`; computed on a local copy because
            # `self.data_sum` is updated later by `_mean`.
            new_sum = [s + v for s, v in zip(self.data_sum, a)]
            new_mean = [s / self.data_count for s in new_sum]
            delta2 = [v - m for v, m in zip(a, new_mean)]
            self.data_stdev_sum = [
                m2 + d1 * d2
                for m2, d1, d2 in zip(self.data_stdev_sum, delta, delta2)
            ]
            if self.data_count == 1:
                # Single observation: no unbiased estimate exists; report
                # sqrt(M2) (which is 0) instead of dividing by zero.
                stdev = [math.sqrt(m2) for m2 in self.data_stdev_sum]
            else:
                stdev = [
                    math.sqrt(m2 / (self.data_count - 1))
                    for m2 in self.data_stdev_sum
                ]
        else:
            delta = [v - m for v, m in zip(a, mean)]
            new_sum = self.data_sum + sum(a)
            new_mean = [new_sum / (self.data_count * len(a))] * len(a)
            delta2 = [v - m for v, m in zip(a, new_mean)]
            self.data_stdev_sum += sum(d1 * d2 for d1, d2 in zip(delta, delta2))
            # n is the total number of scalar values seen (all columns share
            # the parameters). Guard n == 1: the previous code divided by
            # zero on the first instance of single-column data.
            n = self.data_count * len(a)
            if n == 1:
                stdev = [math.sqrt(self.data_stdev_sum)] * len(a)
            else:
                stdev = [math.sqrt(self.data_stdev_sum / (n - 1))] * len(a)
        return stdev

    def learn_one(self, x):
        """Updates `avg`, `stdev` and `count` parameters for each feature.

        Standard deviation is updated first because Welford's algorithm
        needs the previous mean (stored in `self.data_avg`).

        Args:
            x (list): input data from stream generator.

        Returns:
            BasePreprocessor: self updated scaler.
        """
        if self.data_sum is None:
            # Lazy initialization on the first instance: `len(x)` is needed
            # to size the per-column accumulators.
            self.data = [x]
            self.data_avg = x
            self.data_sum = [0.0] * len(x)
            self.data_stdev_sum = [0.0] * len(x)
            if self.share_params:
                # Shared parameters use scalar accumulators over all columns.
                self.data_sum = 0.0
                self.data_stdev_sum = 0.0
                self.data_avg = [sum(x) / (self.data_count * len(x))] * len(x)
        self.data_stdev = self._standard_deviation(x)
        self.data_avg = self._mean(x)
        self.data_count += 1
        return self

    def _standardization(self, val, avg_val, std_val):
        """Standardize a single value; returns 0 when `std_val` is 0."""

        def _safe_div_zero(a, b):
            # The first instance of a stream (or a constant feature) has
            # zero standard deviation; map it to 0 instead of raising.
            return 0 if b == 0 else a / b

        return _safe_div_zero(val - avg_val, std_val)

    def transform_one(self, x):
        """Scales one instance data.

        Args:
            x (list): input data from stream generator.

        Returns:
            scaled_x (list): standardized (z-score scaled) data.
        """
        assert self.data_sum is not None
        scaled_x = [
            self._standardization(v, a, s)
            for v, a, s in zip(x, self.data_avg, self.data_stdev)
        ]
        return scaled_x
learn_one(self, x)
¶
Updates avg
and count
parameters for each feature
Parameters:
Name | Type | Description | Default |
---|---|---|---|
x |
list |
input data from stream generator. |
required |
Returns:
Type | Description |
---|---|
BasePreprocessor |
self updated scaler. |
Source code in ADLStream/data/preprocessing/standardization_scaler.py
def learn_one(self, x):
    """Updates `avg`, `stdev` and `count` parameters for each feature.

    Standard deviation is updated first because Welford's algorithm needs
    the previous mean (stored in `self.data_avg`); the mean is updated
    afterwards.

    Args:
        x (list): input data from stream generator.

    Returns:
        BasePreprocessor: self updated scaler.
    """
    if self.data_sum is None:
        # Lazy initialization on the first instance: `len(x)` is needed
        # to size the per-column accumulators.
        self.data = [x]
        self.data_avg = x
        # NOTE(review): `data_mean` is never read in this file — confirm
        # before removing.
        self.data_mean = x
        self.data_sum = [0.0] * len(x)
        self.data_stdev_sum = [0.0] * len(x)
        if self.share_params == True:
            # Shared parameters: scalar accumulators over all columns.
            self.data_sum = 0.0
            self.data_stdev_sum = 0.0
            self.data_avg = [
                self.data_sum + sum(x) / (self.data_count * len(x))
            ] * len(x)
    self.data_stdev = self._standard_deviation(x)
    self.data_avg = self._mean(x)
    self.data_count += 1
    return self
transform_one(self, x)
¶
Scales one instance data
Parameters:
Name | Type | Description | Default |
---|---|---|---|
x |
list |
input data from stream generator. |
required |
Returns:
Type | Description |
---|---|
scaled_x (list) |
standardization (z-score) scaled data. |
Source code in ADLStream/data/preprocessing/standardization_scaler.py
def transform_one(self, x):
    """Scales one instance data.

    Args:
        x (list): input data from stream generator.

    Returns:
        scaled_x (list): standardized (z-score scaled) data.
    """
    assert self.data_sum is not None
    scaled_x = [
        self._standardization(v, a, s)
        for v, a, s in zip(x, self.data_avg, self.data_stdev)
    ]
    return scaled_x