Commit 4e93532

Format using ruff

1 parent edf9a13

32 files changed (+397, -110 lines)
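
Every hunk below follows the same pattern from ruff's formatter (black-compatible, 88-character default line length): calls and signatures that overflow the limit are exploded one element per line and closed with a magic trailing comma, and exponents with simple operands lose their spaces (distances ** 2 becomes distances**2, while np.dot(X, self.v) ** 2 keeps them because the operand is a call). A minimal sketch of the splitting rule on a hypothetical signature, not code from this repo:

# Before: a signature past the line-length limit sits on one line.
def train(self, n_estimators=10, max_features=None, min_samples_split=10, max_depth=None): ...

# After ruff format: one parameter per line, with a trailing comma that
# tells the formatter to keep the exploded layout on future runs.
def train(
    self,
    n_estimators=10,
    max_features=None,
    min_samples_split=10,
    max_depth=None,
): ...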

mla/datasets/base.py

Lines changed: 4 additions & 2 deletions
@@ -39,7 +39,9 @@ def load(dataset="training", digits=np.arange(10)):
     images = zeros((N, rows, cols), dtype=uint8)
     labels = zeros((N, 1), dtype=int8)
     for i in range(len(ind)):
-        images[i] = array(img[ind[i] * rows * cols: (ind[i] + 1) * rows * cols]).reshape((rows, cols))
+        images[i] = array(
+            img[ind[i] * rows * cols : (ind[i] + 1) * rows * cols]
+        ).reshape((rows, cols))
         labels[i] = lbl[ind[i]]

     return images, labels
@@ -64,7 +66,7 @@ def load_nietzsche():
     sentences = []
     next_chars = []
     for i in range(0, len(text) - maxlen, step):
-        sentences.append(text[i: i + maxlen])
+        sentences.append(text[i : i + maxlen])
         next_chars.append(text[i + maxlen])

     X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
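
For orientation, the load_nietzsche slice being reformatted builds overlapping character windows with a one-character prediction target, the standard setup for a character-level language model. A standalone sketch of the same windowing on a toy string (the real function reads the Nietzsche corpus):

text = "machine learning"
maxlen, step = 5, 2

sentences, next_chars = [], []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i : i + maxlen])  # input window of maxlen chars
    next_chars.append(text[i + maxlen])     # the character to predict

print(sentences[:3])   # ['machi', 'chine', 'ine l']
print(next_chars[:3])  # ['n', ' ', 'e']

(Unrelated to the formatting change: np.bool on the last context line is deprecated in modern NumPy; plain bool is the current spelling.)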

mla/ensemble/base.py

Lines changed: 9 additions & 2 deletions
@@ -14,13 +14,20 @@ def f_entropy(p):


 def information_gain(y, splits):
-    splits_entropy = sum([f_entropy(split) * (float(split.shape[0]) / y.shape[0]) for split in splits])
+    splits_entropy = sum(
+        [f_entropy(split) * (float(split.shape[0]) / y.shape[0]) for split in splits]
+    )
     return f_entropy(y) - splits_entropy


 def mse_criterion(y, splits):
     y_mean = np.mean(y)
-    return -sum([np.sum((split - y_mean) ** 2) * (float(split.shape[0]) / y.shape[0]) for split in splits])
+    return -sum(
+        [
+            np.sum((split - y_mean) ** 2) * (float(split.shape[0]) / y.shape[0])
+            for split in splits
+        ]
+    )


 def xgb_criterion(y, left, right, loss):
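
To make the reformatted criteria concrete: information_gain returns the parent entropy minus the size-weighted entropy of the child splits. A self-contained numeric check, with a stand-in f_entropy using base-2 Shannon entropy (the repo's own f_entropy may use natural log, which only rescales the result):

import numpy as np

def f_entropy(p):
    # Entropy of the empirical label distribution in p.
    freqs = np.bincount(p) / float(p.shape[0])
    freqs = freqs[freqs > 0]
    return -np.sum(freqs * np.log2(freqs))

y = np.array([0, 0, 0, 0, 1, 1, 1, 1])  # parent entropy: exactly 1 bit
splits = [np.array([0, 0, 0, 1]), np.array([0, 1, 1, 1])]

splits_entropy = sum(
    f_entropy(split) * (float(split.shape[0]) / y.shape[0]) for split in splits
)
print(f_entropy(y) - splits_entropy)  # ~0.189: an imperfect split, modest gain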

mla/ensemble/gbm.py

Lines changed: 12 additions & 2 deletions
@@ -1,5 +1,6 @@
 # coding:utf-8
 import numpy as np
+
 # logistic function
 from scipy.special import expit

@@ -32,7 +33,9 @@ def hess(self, actual, predicted):

     def approximate(self, actual, predicted):
         """Approximate leaf value."""
-        return self.grad(actual, predicted).sum() / (self.hess(actual, predicted).sum() + self.regularization)
+        return self.grad(actual, predicted).sum() / (
+            self.hess(actual, predicted).sum() + self.regularization
+        )

     def transform(self, pred):
         """Transform predictions values."""
@@ -73,7 +76,14 @@ def transform(self, output):
 class GradientBoosting(BaseEstimator):
     """Gradient boosting trees with Taylor's expansion approximation (as in xgboost)."""

-    def __init__(self, n_estimators, learning_rate=0.1, max_features=10, max_depth=2, min_samples_split=10):
+    def __init__(
+        self,
+        n_estimators,
+        learning_rate=0.1,
+        max_features=10,
+        max_depth=2,
+        min_samples_split=10,
+    ):
         self.min_samples_split = min_samples_split
         self.learning_rate = learning_rate
         self.max_depth = max_depth
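
The approximate method wrapped above computes xgboost's second-order leaf value: the sum of gradients divided by the sum of hessians plus the regularization term. A standalone sketch assuming squared loss, where the gradient is the residual and the hessian is constant 1:

import numpy as np

regularization = 1.0
actual = np.array([3.0, 2.5, 4.0])
predicted = np.array([2.0, 2.0, 2.0])

grad = actual - predicted    # residuals under squared loss
hess = np.ones_like(actual)  # constant second derivative

# Leaf value: a damped mean residual; regularization shrinks it toward 0.
leaf_value = grad.sum() / (hess.sum() + regularization)
print(leaf_value)  # 3.5 / (3 + 1) = 0.875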

mla/ensemble/random_forest.py

Lines changed: 25 additions & 4 deletions
@@ -7,7 +7,14 @@


 class RandomForest(BaseEstimator):
-    def __init__(self, n_estimators=10, max_features=None, min_samples_split=10, max_depth=None, criterion=None):
+    def __init__(
+        self,
+        n_estimators=10,
+        max_features=None,
+        min_samples_split=10,
+        max_depth=None,
+        criterion=None,
+    ):
         """Base class for RandomForest.

         Parameters
@@ -44,15 +51,22 @@ def _train(self):
                 self.y,
                 max_features=self.max_features,
                 min_samples_split=self.min_samples_split,
-                max_depth=self.max_depth
+                max_depth=self.max_depth,
             )

     def _predict(self, X=None):
         raise NotImplementedError()


 class RandomForestClassifier(RandomForest):
-    def __init__(self, n_estimators=10, max_features=None, min_samples_split=10, max_depth=None, criterion="entropy"):
+    def __init__(
+        self,
+        n_estimators=10,
+        max_features=None,
+        min_samples_split=10,
+        max_depth=None,
+        criterion="entropy",
+    ):
         super(RandomForestClassifier, self).__init__(
             n_estimators=n_estimators,
             max_features=max_features,
@@ -85,7 +99,14 @@ def _predict(self, X=None):


 class RandomForestRegressor(RandomForest):
-    def __init__(self, n_estimators=10, max_features=None, min_samples_split=10, max_depth=None, criterion="mse"):
+    def __init__(
+        self,
+        n_estimators=10,
+        max_features=None,
+        min_samples_split=10,
+        max_depth=None,
+        criterion="mse",
+    ):
         super(RandomForestRegressor, self).__init__(
             n_estimators=n_estimators,
             max_features=max_features,
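
Nothing here changes behavior; the split signatures just make the shared hyperparameters scannable. A hypothetical usage sketch (toy data invented here; assumes the fit/predict interface of the repo's BaseEstimator):

import numpy as np
from mla.ensemble.random_forest import RandomForestClassifier

X = np.random.rand(100, 4)
y = (X[:, 0] + X[:, 1] > 1.0).astype(int)  # synthetic binary labels

model = RandomForestClassifier(
    n_estimators=10,
    max_features=2,        # features sampled per tree
    min_samples_split=10,  # do not split nodes smaller than this
    max_depth=5,
)
model.fit(X, y)
predictions = model.predict(X)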

mla/ensemble/tree.py

Lines changed: 50 additions & 11 deletions
@@ -58,14 +58,24 @@ def _find_best_split(self, X, target, n_features):
                     gain = self.criterion(target["y"], splits)
                 else:
                     # Gradient boosting
-                    left, right = split_dataset(X, target, column, value, return_X=False)
+                    left, right = split_dataset(
+                        X, target, column, value, return_X=False
+                    )
                     gain = xgb_criterion(target, left, right, self.loss)

                 if (max_gain is None) or (gain > max_gain):
                     max_col, max_val, max_gain = column, value, gain
         return max_col, max_val, max_gain

-    def _train(self, X, target, max_features=None, min_samples_split=10, max_depth=None, minimum_gain=0.01):
+    def _train(
+        self,
+        X,
+        target,
+        max_features=None,
+        min_samples_split=10,
+        max_depth=None,
+        minimum_gain=0.01,
+    ):
         try:
             # Exit from recursion using assert syntax
             assert X.shape[0] > min_samples_split
@@ -86,22 +96,43 @@ def _train(self, X, target, max_features=None, min_samples_split=10, max_depth=N
             self.impurity = gain

             # Split dataset
-            left_X, right_X, left_target, right_target = split_dataset(X, target, column, value)
+            left_X, right_X, left_target, right_target = split_dataset(
+                X, target, column, value
+            )

             # Grow left and right child
             self.left_child = Tree(self.regression, self.criterion, self.n_classes)
             self.left_child._train(
-                left_X, left_target, max_features, min_samples_split, max_depth - 1, minimum_gain
+                left_X,
+                left_target,
+                max_features,
+                min_samples_split,
+                max_depth - 1,
+                minimum_gain,
             )

             self.right_child = Tree(self.regression, self.criterion, self.n_classes)
             self.right_child._train(
-                right_X, right_target, max_features, min_samples_split, max_depth - 1, minimum_gain
+                right_X,
+                right_target,
+                max_features,
+                min_samples_split,
+                max_depth - 1,
+                minimum_gain,
             )
         except AssertionError:
             self._calculate_leaf_value(target)

-    def train(self, X, target, max_features=None, min_samples_split=10, max_depth=None, minimum_gain=0.01, loss=None):
+    def train(
+        self,
+        X,
+        target,
+        max_features=None,
+        min_samples_split=10,
+        max_depth=None,
+        minimum_gain=0.01,
+        loss=None,
+    ):
         """Build a decision tree from training set.

         Parameters
@@ -131,11 +162,16 @@ def train(self, X, target, max_features=None, min_samples_split=10, max_depth=No
         self.loss = loss

         if not self.regression:
-            self.n_classes = len(np.unique(target['y']))
-
-        self._train(X, target, max_features=max_features, min_samples_split=min_samples_split,
-                    max_depth=max_depth, minimum_gain=minimum_gain)
+            self.n_classes = len(np.unique(target["y"]))

+        self._train(
+            X,
+            target,
+            max_features=max_features,
+            min_samples_split=min_samples_split,
+            max_depth=max_depth,
+            minimum_gain=minimum_gain,
+        )

     def _calculate_leaf_value(self, targets):
         """Find optimal value for leaf."""
@@ -149,7 +185,10 @@ def _calculate_leaf_value(self, targets):
             self.outcome = np.mean(targets["y"])
         else:
             # Probability for classification task
-            self.outcome = np.bincount(targets["y"], minlength=self.n_classes) / targets["y"].shape[0]
+            self.outcome = (
+                np.bincount(targets["y"], minlength=self.n_classes)
+                / targets["y"].shape[0]
+            )

     def predict_row(self, row):
         """Predict single row."""

mla/fm.py

Lines changed: 15 additions & 3 deletions
@@ -6,6 +6,7 @@
 from mla.base import BaseEstimator
 from mla.metrics import mean_squared_error, binary_crossentropy

+
 np.random.seed(9999)

 """
@@ -16,7 +17,14 @@

 class BaseFM(BaseEstimator):
     def __init__(
-        self, n_components=10, max_iter=100, init_stdev=0.1, learning_rate=0.01, reg_v=0.1, reg_w=0.5, reg_w0=0.0
+        self,
+        n_components=10,
+        max_iter=100,
+        init_stdev=0.1,
+        learning_rate=0.01,
+        reg_v=0.1,
+        reg_w=0.5,
+        reg_w0=0.0,
     ):
         """Simplified factorization machines implementation using SGD optimizer."""
         self.reg_w0 = reg_w0
@@ -36,7 +44,9 @@ def fit(self, X, y=None):
         # Feature weights
         self.w = np.zeros(self.n_features)
         # Factor weights
-        self.v = np.random.normal(scale=self.init_stdev, size=(self.n_features, self.n_components))
+        self.v = np.random.normal(
+            scale=self.init_stdev, size=(self.n_features, self.n_components)
+        )
         self._train()

     def _train(self):
@@ -56,7 +66,9 @@ def _factor_step(self, loss):

     def _predict(self, X=None):
         linear_output = np.dot(X, self.w)
-        factors_output = np.sum(np.dot(X, self.v) ** 2 - np.dot(X ** 2, self.v ** 2), axis=1) / 2.0
+        factors_output = (
+            np.sum(np.dot(X, self.v) ** 2 - np.dot(X**2, self.v**2), axis=1) / 2.0
+        )
         return self.wo + linear_output + factors_output
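
The _predict reformatting touches the classic factorization-machine identity: the pairwise interaction term sum_{i<j} <v_i, v_j> x_i x_j equals (1/2) * sum_f [(sum_i v_{i,f} x_i)^2 - sum_i v_{i,f}^2 x_i^2], which is what the vectorized line computes in O(n*k) instead of O(n^2*k). A small numeric verification of the identity on random toy data:

import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=6)       # one sample, 6 features
v = rng.normal(size=(6, 3))  # 3 latent factors per feature

# Vectorized form, as in fm.py (single-row version).
fast = np.sum(np.dot(x, v) ** 2 - np.dot(x**2, v**2)) / 2.0

# Brute-force pairwise sum for comparison.
slow = sum(
    np.dot(v[i], v[j]) * x[i] * x[j] for i in range(6) for j in range(i + 1, 6)
)
print(np.isclose(fast, slow))  # True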

mla/gaussian_mixture.py

Lines changed: 12 additions & 4 deletions
@@ -68,7 +68,9 @@ def _initialize(self):
         """
         self.weights = np.ones(self.K)
         if self.init == "random":
-            self.means = [self.X[x] for x in random.sample(range(self.n_samples), self.K)]
+            self.means = [
+                self.X[x] for x in random.sample(range(self.n_samples), self.K)
+            ]
             self.covs = [np.cov(self.X.T) for _ in range(self.K)]

         elif self.init == "kmeans":
@@ -106,7 +108,9 @@ def _M_step(self):

     def _is_converged(self):
         """Check if the difference of the latest two likelihood is less than the tolerance."""
-        if (len(self.likelihood) > 1) and (self.likelihood[-1] - self.likelihood[-2] <= self.tolerance):
+        if (len(self.likelihood) > 1) and (
+            self.likelihood[-1] - self.likelihood[-2] <= self.tolerance
+        ):
             return True
         return False

@@ -123,7 +127,9 @@ def _get_likelihood(self, data):
         n_data = data.shape[0]
         likelihoods = np.zeros([n_data, self.K])
         for c in range(self.K):
-            likelihoods[:, c] = multivariate_normal.pdf(data, self.means[c], self.covs[c])
+            likelihoods[:, c] = multivariate_normal.pdf(
+                data, self.means[c], self.covs[c]
+            )
         return likelihoods

     def _get_weighted_likelihood(self, likelihood):
@@ -151,7 +157,9 @@ def plot(self, data=None, ax=None, holdon=False):
         margin = 0.2
         xmax, ymax = self.X.max(axis=0) + margin
         xmin, ymin = self.X.min(axis=0) - margin
-        axis_X, axis_Y = np.meshgrid(np.arange(xmin, xmax, delta), np.arange(ymin, ymax, delta))
+        axis_X, axis_Y = np.meshgrid(
+            np.arange(xmin, xmax, delta), np.arange(ymin, ymax, delta)
+        )

         def grid_gaussian_pdf(mean, cov):
             grid_array = np.array(list(zip(axis_X.flatten(), axis_Y.flatten())))
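
_is_converged, reformatted above, is EM's standard stopping rule: quit once the likelihood gain between consecutive iterations falls below the tolerance. A standalone illustration of the same check on a made-up likelihood trace:

likelihood = [-1500.2, -1480.7, -1479.9, -1479.85]
tolerance = 0.1

def is_converged(likelihood, tolerance):
    # Mirrors GaussianMixture._is_converged: compare the last two entries.
    if (len(likelihood) > 1) and (likelihood[-1] - likelihood[-2] <= tolerance):
        return True
    return False

print(is_converged(likelihood, tolerance))  # True: final gain 0.05 <= 0.1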

mla/kmeans.py

Lines changed: 13 additions & 5 deletions
@@ -53,7 +53,9 @@ def _initialize_centroids(self, init):
         """Set the initial centroids."""

         if init == "random":
-            self.centroids = [self.X[x] for x in random.sample(range(self.n_samples), self.K)]
+            self.centroids = [
+                self.X[x] for x in random.sample(range(self.n_samples), self.K)
+            ]
         elif init == "++":
             self.centroids = [random.choice(self.X)]
             while len(self.centroids) < self.K:
@@ -88,7 +90,6 @@ def _get_predictions(self):
         return predictions

     def _assign(self, centroids):
-
         for row in range(self.n_samples):
             for i, cluster in enumerate(self.clusters):
                 if row in cluster:
@@ -115,11 +116,13 @@ def _get_centroid(self, cluster):

     def _dist_from_centers(self):
         """Calculate distance from centers."""
-        return np.array([min([euclidean_distance(x, c) for c in self.centroids]) for x in self.X])
+        return np.array(
+            [min([euclidean_distance(x, c) for c in self.centroids]) for x in self.X]
+        )

     def _choose_next_center(self):
         distances = self._dist_from_centers()
-        squared_distances = distances ** 2
+        squared_distances = distances**2
         probs = squared_distances / squared_distances.sum()
         ind = np.random.choice(self.X.shape[0], 1, p=probs)[0]
         return self.X[ind]
@@ -141,7 +144,12 @@ def plot(self, ax=None, holdon=False):

         for i, index in enumerate(self.clusters):
             point = np.array(data[index]).T
-            ax.scatter(*point, c=[palette[i], ])
+            ax.scatter(
+                *point,
+                c=[
+                    palette[i],
+                ],
+            )

         for point in self.centroids:
             ax.scatter(*point, marker="x", linewidths=10)
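
_choose_next_center, touched twice above, is the k-means++ seeding step: sample the next centroid with probability proportional to D(x)^2, the squared distance from each point to its nearest existing center. A compact standalone sketch of that rule:

import numpy as np

rng = np.random.default_rng(42)
X = rng.normal(size=(50, 2))
centroids = [X[0]]  # first center picked uniformly at random

# D(x): distance from each point to its nearest chosen center.
distances = np.array(
    [min(np.linalg.norm(x - c) for c in centroids) for x in X]
)
squared_distances = distances**2
probs = squared_distances / squared_distances.sum()

# Far-away points are proportionally likelier to seed the next center.
next_center = X[rng.choice(X.shape[0], p=probs)]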

mla/knn.py

Lines changed: 4 additions & 1 deletion
@@ -40,7 +40,10 @@ def _predict_x(self, x):
         distances = (self.distance_func(x, example) for example in self.X)

         # Sort all examples by their distance to x and keep their target value.
-        neighbors = sorted(((dist, target) for (dist, target) in zip(distances, self.y)), key=lambda x: x[0])
+        neighbors = sorted(
+            ((dist, target) for (dist, target) in zip(distances, self.y)),
+            key=lambda x: x[0],
+        )

         # Get targets of the k-nn and aggregate them (most common one or
         # average).
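
The reformatted sorted call is the core of _predict_x: pair every training target with its distance to the query, order by distance, then aggregate the k nearest. A self-contained sketch using Euclidean distance and a majority vote (the repo's distance_func and aggregation are configurable):

import numpy as np
from collections import Counter

X = np.array([[0.0, 0.0], [1.0, 0.0], [5.0, 5.0], [0.5, 0.5]])
y = np.array([0, 0, 1, 0])
x, k = np.array([0.2, 0.1]), 3

distances = (np.linalg.norm(x - example) for example in X)
neighbors = sorted(
    ((dist, target) for (dist, target) in zip(distances, y)),
    key=lambda pair: pair[0],
)

# Majority vote over the k nearest targets (classification case).
targets = [target for (_, target) in neighbors[:k]]
print(Counter(targets).most_common(1)[0][0])  # 0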
