systemml.mllearn package

Submodules

systemml.mllearn.estimators module

class systemml.mllearn.estimators.LinearRegression(sparkSession, fit_intercept=True, normalize=False, max_iter=100, tol=1e-06, C=inf, solver='newton-cg', transferUsingDF=False)

Bases: systemml.mllearn.estimators.BaseSystemMLRegressor

Performs linear regression to model the relationship between one numerical response variable and one or more explanatory (feature) variables.

Examples

>>> import numpy as np
>>> from sklearn import datasets
>>> from systemml.mllearn import LinearRegression
>>> from pyspark.sql import SparkSession
>>> sparkSession = SparkSession.builder.getOrCreate()
>>> # Load the diabetes dataset
>>> diabetes = datasets.load_diabetes()
>>> # Use only one feature
>>> diabetes_X = diabetes.data[:, np.newaxis, 2]
>>> # Split the data into training/testing sets
>>> diabetes_X_train = diabetes_X[:-20]
>>> diabetes_X_test = diabetes_X[-20:]
>>> # Split the targets into training/testing sets
>>> diabetes_y_train = diabetes.target[:-20]
>>> diabetes_y_test = diabetes.target[-20:]
>>> # Create linear regression object
>>> regr = LinearRegression(sparkSession, solver='newton-cg')
>>> # Train the model using the training sets
>>> regr.fit(diabetes_X_train, diabetes_y_train)
>>> # The mean squared error
>>> print("Mean squared error: %.2f" % np.mean((regr.predict(diabetes_X_test) - diabetes_y_test) ** 2))
class systemml.mllearn.estimators.LogisticRegression(sparkSession, penalty='l2', fit_intercept=True, normalize=False, max_iter=100, max_inner_iter=0, tol=1e-06, C=1.0, solver='newton-cg', transferUsingDF=False)

Bases: systemml.mllearn.estimators.BaseSystemMLClassifier

Performs both binomial and multinomial logistic regression.

Examples

Scikit-learn way

>>> from sklearn import datasets
>>> from systemml.mllearn import LogisticRegression
>>> from pyspark.sql import SparkSession
>>> sparkSession = SparkSession.builder.getOrCreate()
>>> digits = datasets.load_digits()
>>> X_digits = digits.data
>>> y_digits = digits.target + 1
>>> n_samples = len(X_digits)
>>> n_train = int(.9 * n_samples)
>>> X_train = X_digits[:n_train]
>>> y_train = y_digits[:n_train]
>>> X_test = X_digits[n_train:]
>>> y_test = y_digits[n_train:]
>>> logistic = LogisticRegression(sparkSession)
>>> print('LogisticRegression score: %f' % logistic.fit(X_train, y_train).score(X_test, y_test))

MLPipeline way

>>> from pyspark.ml import Pipeline
>>> from systemml.mllearn import LogisticRegression
>>> from pyspark.ml.feature import HashingTF, Tokenizer
>>> from pyspark.sql import SparkSession
>>> sparkSession = SparkSession.builder.getOrCreate()
>>> training = sparkSession.createDataFrame([
...     (0, "a b c d e spark", 1.0),
...     (1, "b d", 2.0),
...     (2, "spark f g h", 1.0),
...     (3, "hadoop mapreduce", 2.0),
...     (4, "b spark who", 1.0),
...     (5, "g d a y", 2.0),
...     (6, "spark fly", 1.0),
...     (7, "was mapreduce", 2.0),
...     (8, "e spark program", 1.0),
...     (9, "a e c l", 2.0),
...     (10, "spark compile", 1.0),
...     (11, "hadoop software", 2.0)
... ], ["id", "text", "label"])
>>> tokenizer = Tokenizer(inputCol="text", outputCol="words")
>>> hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20)
>>> lr = LogisticRegression(sparkSession)
>>> pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
>>> model = pipeline.fit(training)
>>> test = sparkSession.createDataFrame([
...     (12, "spark i j k"),
...     (13, "l m n"),
...     (14, "mapreduce spark"),
...     (15, "apache hadoop")], ["id", "text"])
>>> prediction = model.transform(test)
>>> prediction.show()
class systemml.mllearn.estimators.SVM(sparkSession, fit_intercept=True, normalize=False, max_iter=100, tol=1e-06, C=1.0, is_multi_class=False, transferUsingDF=False)

Bases: systemml.mllearn.estimators.BaseSystemMLClassifier

Performs both binary-class and multi-class SVM (Support Vector Machines).

Examples

>>> from sklearn import datasets
>>> from systemml.mllearn import SVM
>>> from pyspark.sql import SparkSession
>>> sparkSession = SparkSession.builder.getOrCreate()
>>> digits = datasets.load_digits()
>>> X_digits = digits.data
>>> y_digits = digits.target
>>> n_samples = len(X_digits)
>>> n_train = int(.9 * n_samples)
>>> X_train = X_digits[:n_train]
>>> y_train = y_digits[:n_train]
>>> X_test = X_digits[n_train:]
>>> y_test = y_digits[n_train:]
>>> svm = SVM(sparkSession, is_multi_class=True)
>>> print('SVM score: %f' % svm.fit(X_train, y_train).score(X_test, y_test))
class systemml.mllearn.estimators.NaiveBayes(sparkSession, laplace=1.0, transferUsingDF=False)

Bases: systemml.mllearn.estimators.BaseSystemMLClassifier

Performs multinomial Naive Bayes classification.

Examples

>>> from sklearn.datasets import fetch_20newsgroups
>>> from sklearn.feature_extraction.text import TfidfVectorizer
>>> from systemml.mllearn import NaiveBayes
>>> from sklearn import metrics
>>> from pyspark.sql import SparkSession
>>> sparkSession = SparkSession.builder.getOrCreate()
>>> categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
>>> newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
>>> newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
>>> vectorizer = TfidfVectorizer()
>>> # Both vectors and vectors_test are SciPy CSR matrices
>>> vectors = vectorizer.fit_transform(newsgroups_train.data)
>>> vectors_test = vectorizer.transform(newsgroups_test.data)
>>> nb = NaiveBayes(sparkSession)
>>> nb.fit(vectors, newsgroups_train.target)
>>> pred = nb.predict(vectors_test)
>>> metrics.f1_score(newsgroups_test.target, pred, average='weighted')
class systemml.mllearn.estimators.Caffe2DML(sparkSession, solver, input_shape, transferUsingDF=False, tensorboard_log_dir=None)

Bases: systemml.mllearn.estimators.BaseSystemMLClassifier

Performs training/prediction for a given Caffe network.

Examples

>>> from systemml.mllearn import Caffe2DML
>>> from mlxtend.data import mnist_data
>>> import numpy as np
>>> from sklearn.utils import shuffle
>>> X, y = mnist_data()
>>> X, y = shuffle(X, y)
>>> imgShape = (1, 28, 28)
>>> from pyspark.sql import SparkSession
>>> sparkSession = SparkSession.builder.getOrCreate()
>>> import urllib.request
>>> urllib.request.urlretrieve('https://raw.githubusercontent.com/niketanpansare/model_zoo/master/caffe/vision/lenet/mnist/lenet.proto', 'lenet.proto')
>>> urllib.request.urlretrieve('https://raw.githubusercontent.com/niketanpansare/model_zoo/master/caffe/vision/lenet/mnist/lenet_solver.proto', 'lenet_solver.proto')
>>> caffe2DML = Caffe2DML(sparkSession, solver='lenet_solver.proto', input_shape=imgShape).set(max_iter=500)
>>> caffe2DML.fit(X, y)
load(weights=None, sep='/', ignore_weights=None, eager=False)

Load a pretrained model.

Parameters:
  • weights (directory where the learned weights are stored (default: None)) –
  • sep (separator to use (default: '/')) –
  • ignore_weights (names of layers not to read from the weights directory (list of str, default: None)) –
  • eager (whether to load the model eagerly (default: False)) –
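
For example, weights saved by an earlier training run can be restored into a freshly constructed estimator. A minimal sketch, reusing the solver file downloaded in the example above; 'lenet_weights' is a hypothetical directory containing previously saved weights:

>>> caffe2DML = Caffe2DML(sparkSession, solver='lenet_solver.proto', input_shape=(1, 28, 28))
>>> caffe2DML.load(weights='lenet_weights', eager=True)  # 'lenet_weights' is a hypothetical path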
set(debug=None, train_algo=None, test_algo=None, parallel_batches=None, output_activations=None)

Set configuration parameters for Caffe2DML.

Parameters:
  • debug (whether to add debugging DML code such as a classification report and printing of the DML script (default: False)) –
  • train_algo (can be minibatch, batch, allreduce_parallel_batches or allreduce (default: minibatch)) –
  • test_algo (can be minibatch, batch, allreduce_parallel_batches or allreduce (default: minibatch)) –
  • parallel_batches (number of parallel batches) –
  • output_activations ((developer flag) directory to which the activations of each layer are written as CSV during prediction; to be used only in batch mode (default: None)) –
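
As an illustration of how these options combine, the following sketch enables parallel-batch training and debugging output on the estimator constructed above:

>>> caffe2DML.set(train_algo='allreduce_parallel_batches', parallel_batches=4)
>>> caffe2DML.set(debug=True)  # print the generated DML script and a classification report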
summary()

Print the summary of the network

visualize(layerName=None, varType='weight', aggFn='mean')

Visualize the training procedure (requires validation_percentage to be non-zero). If no arguments are provided, training and validation loss are visualized.

Parameters:
  • layerName (name of the layer in the Caffe prototxt file) –
  • varType (should be either 'weight', 'bias', 'dweight', 'dbias', 'output' or 'doutput') –
  • aggFn (should be either 'sum', 'mean', 'var' or 'sd') –
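
A sketch of typical calls, assuming training ran with a non-zero validation_percentage and that the prototxt defines a layer named 'conv1' (a hypothetical layer name):

>>> caffe2DML.visualize()  # no arguments: plot training and validation loss
>>> caffe2DML.visualize('conv1', varType='weight', aggFn='mean')  # mean of the 'conv1' weights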

Module contents

SystemML Algorithms

Classification Algorithms
  • LogisticRegression: Performs binomial and multinomial logistic regression
  • SVM: Performs both binary-class and multi-class SVM
  • NaiveBayes: Multinomial Naive Bayes classifier

Regression Algorithms
  • LinearRegression: Performs linear regression
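
All four estimators are re-exported at the package level, so they can be imported directly from systemml.mllearn, as the examples below do:

>>> from systemml.mllearn import LinearRegression, LogisticRegression, SVM, NaiveBayes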
class systemml.mllearn.LinearRegression(sparkSession, fit_intercept=True, normalize=False, max_iter=100, tol=1e-06, C=inf, solver='newton-cg', transferUsingDF=False)

Bases: systemml.mllearn.estimators.BaseSystemMLRegressor

Performs linear regression to model the relationship between one numerical response variable and one or more explanatory (feature) variables.

Examples

>>> import numpy as np
>>> from sklearn import datasets
>>> from systemml.mllearn import LinearRegression
>>> from pyspark.sql import SparkSession
>>> sparkSession = SparkSession.builder.getOrCreate()
>>> # Load the diabetes dataset
>>> diabetes = datasets.load_diabetes()
>>> # Use only one feature
>>> diabetes_X = diabetes.data[:, np.newaxis, 2]
>>> # Split the data into training/testing sets
>>> diabetes_X_train = diabetes_X[:-20]
>>> diabetes_X_test = diabetes_X[-20:]
>>> # Split the targets into training/testing sets
>>> diabetes_y_train = diabetes.target[:-20]
>>> diabetes_y_test = diabetes.target[-20:]
>>> # Create linear regression object
>>> regr = LinearRegression(sparkSession, solver='newton-cg')
>>> # Train the model using the training sets
>>> regr.fit(diabetes_X_train, diabetes_y_train)
>>> # The mean squared error
>>> print("Mean squared error: %.2f" % np.mean((regr.predict(diabetes_X_test) - diabetes_y_test) ** 2))
class systemml.mllearn.LogisticRegression(sparkSession, penalty='l2', fit_intercept=True, normalize=False, max_iter=100, max_inner_iter=0, tol=1e-06, C=1.0, solver='newton-cg', transferUsingDF=False)

Bases: systemml.mllearn.estimators.BaseSystemMLClassifier

Performs both binomial and multinomial logistic regression.

Examples

Scikit-learn way

>>> from sklearn import datasets
>>> from systemml.mllearn import LogisticRegression
>>> from pyspark.sql import SparkSession
>>> sparkSession = SparkSession.builder.getOrCreate()
>>> digits = datasets.load_digits()
>>> X_digits = digits.data
>>> y_digits = digits.target + 1
>>> n_samples = len(X_digits)
>>> n_train = int(.9 * n_samples)
>>> X_train = X_digits[:n_train]
>>> y_train = y_digits[:n_train]
>>> X_test = X_digits[n_train:]
>>> y_test = y_digits[n_train:]
>>> logistic = LogisticRegression(sparkSession)
>>> print('LogisticRegression score: %f' % logistic.fit(X_train, y_train).score(X_test, y_test))

MLPipeline way

>>> from pyspark.ml import Pipeline
>>> from systemml.mllearn import LogisticRegression
>>> from pyspark.ml.feature import HashingTF, Tokenizer
>>> from pyspark.sql import SparkSession
>>> sparkSession = SparkSession.builder.getOrCreate()
>>> training = sparkSession.createDataFrame([
...     (0, "a b c d e spark", 1.0),
...     (1, "b d", 2.0),
...     (2, "spark f g h", 1.0),
...     (3, "hadoop mapreduce", 2.0),
...     (4, "b spark who", 1.0),
...     (5, "g d a y", 2.0),
...     (6, "spark fly", 1.0),
...     (7, "was mapreduce", 2.0),
...     (8, "e spark program", 1.0),
...     (9, "a e c l", 2.0),
...     (10, "spark compile", 1.0),
...     (11, "hadoop software", 2.0)
... ], ["id", "text", "label"])
>>> tokenizer = Tokenizer(inputCol="text", outputCol="words")
>>> hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20)
>>> lr = LogisticRegression(sparkSession)
>>> pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
>>> model = pipeline.fit(training)
>>> test = sparkSession.createDataFrame([
...     (12, "spark i j k"),
...     (13, "l m n"),
...     (14, "mapreduce spark"),
...     (15, "apache hadoop")], ["id", "text"])
>>> prediction = model.transform(test)
>>> prediction.show()
class systemml.mllearn.SVM(sparkSession, fit_intercept=True, normalize=False, max_iter=100, tol=1e-06, C=1.0, is_multi_class=False, transferUsingDF=False)

Bases: systemml.mllearn.estimators.BaseSystemMLClassifier

Performs both binary-class and multi-class SVM (Support Vector Machines).

Examples

>>> from sklearn import datasets
>>> from systemml.mllearn import SVM
>>> from pyspark.sql import SparkSession
>>> sparkSession = SparkSession.builder.getOrCreate()
>>> digits = datasets.load_digits()
>>> X_digits = digits.data
>>> y_digits = digits.target
>>> n_samples = len(X_digits)
>>> n_train = int(.9 * n_samples)
>>> X_train = X_digits[:n_train]
>>> y_train = y_digits[:n_train]
>>> X_test = X_digits[n_train:]
>>> y_test = y_digits[n_train:]
>>> svm = SVM(sparkSession, is_multi_class=True)
>>> print('SVM score: %f' % svm.fit(X_train, y_train).score(X_test, y_test))
class systemml.mllearn.NaiveBayes(sparkSession, laplace=1.0, transferUsingDF=False)

Bases: systemml.mllearn.estimators.BaseSystemMLClassifier

Performs multinomial Naive Bayes classification.

Examples

>>> from sklearn.datasets import fetch_20newsgroups
>>> from sklearn.feature_extraction.text import TfidfVectorizer
>>> from systemml.mllearn import NaiveBayes
>>> from sklearn import metrics
>>> from pyspark.sql import SparkSession
>>> sparkSession = SparkSession.builder.getOrCreate()
>>> categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
>>> newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
>>> newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
>>> vectorizer = TfidfVectorizer()
>>> # Both vectors and vectors_test are SciPy CSR matrices
>>> vectors = vectorizer.fit_transform(newsgroups_train.data)
>>> vectors_test = vectorizer.transform(newsgroups_test.data)
>>> nb = NaiveBayes(sparkSession)
>>> nb.fit(vectors, newsgroups_train.target)
>>> pred = nb.predict(vectors_test)
>>> metrics.f1_score(newsgroups_test.target, pred, average='weighted')
class systemml.mllearn.Caffe2DML(sparkSession, solver, input_shape, transferUsingDF=False, tensorboard_log_dir=None)

Bases: systemml.mllearn.estimators.BaseSystemMLClassifier

Performs training/prediction for a given Caffe network.

Examples

>>> from systemml.mllearn import Caffe2DML
>>> from mlxtend.data import mnist_data
>>> import numpy as np
>>> from sklearn.utils import shuffle
>>> X, y = mnist_data()
>>> X, y = shuffle(X, y)
>>> imgShape = (1, 28, 28)
>>> from pyspark.sql import SparkSession
>>> sparkSession = SparkSession.builder.getOrCreate()
>>> import urllib.request
>>> urllib.request.urlretrieve('https://raw.githubusercontent.com/niketanpansare/model_zoo/master/caffe/vision/lenet/mnist/lenet.proto', 'lenet.proto')
>>> urllib.request.urlretrieve('https://raw.githubusercontent.com/niketanpansare/model_zoo/master/caffe/vision/lenet/mnist/lenet_solver.proto', 'lenet_solver.proto')
>>> caffe2DML = Caffe2DML(sparkSession, solver='lenet_solver.proto', input_shape=imgShape).set(max_iter=500)
>>> caffe2DML.fit(X, y)
load(weights=None, sep='/', ignore_weights=None, eager=False)

Load a pretrained model.

Parameters:
  • weights (directory where the learned weights are stored (default: None)) –
  • sep (separator to use (default: '/')) –
  • ignore_weights (names of layers not to read from the weights directory (list of str, default: None)) –
  • eager (whether to load the model eagerly (default: False)) –
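
For example, weights saved by an earlier training run can be restored into a freshly constructed estimator. A minimal sketch, reusing the solver file downloaded in the example above; 'lenet_weights' is a hypothetical directory containing previously saved weights:

>>> caffe2DML = Caffe2DML(sparkSession, solver='lenet_solver.proto', input_shape=(1, 28, 28))
>>> caffe2DML.load(weights='lenet_weights', eager=True)  # 'lenet_weights' is a hypothetical path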
set(debug=None, train_algo=None, test_algo=None, parallel_batches=None, output_activations=None)

Set configuration parameters for Caffe2DML.

Parameters:
  • debug (whether to add debugging DML code such as a classification report and printing of the DML script (default: False)) –
  • train_algo (can be minibatch, batch, allreduce_parallel_batches or allreduce (default: minibatch)) –
  • test_algo (can be minibatch, batch, allreduce_parallel_batches or allreduce (default: minibatch)) –
  • parallel_batches (number of parallel batches) –
  • output_activations ((developer flag) directory to which the activations of each layer are written as CSV during prediction; to be used only in batch mode (default: None)) –
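
As an illustration of how these options combine, the following sketch enables parallel-batch training and debugging output on the estimator constructed above:

>>> caffe2DML.set(train_algo='allreduce_parallel_batches', parallel_batches=4)
>>> caffe2DML.set(debug=True)  # print the generated DML script and a classification report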
summary()

Print the summary of the network

visualize(layerName=None, varType='weight', aggFn='mean')

Visualize the training procedure (requires validation_percentage to be non-zero). If no arguments are provided, training and validation loss are visualized.

Parameters:
  • layerName (name of the layer in the Caffe prototxt file) –
  • varType (should be either 'weight', 'bias', 'dweight', 'dbias', 'output' or 'doutput') –
  • aggFn (should be either 'sum', 'mean', 'var' or 'sd') –
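
A sketch of typical calls, assuming training ran with a non-zero validation_percentage and that the prototxt defines a layer named 'conv1' (a hypothetical layer name):

>>> caffe2DML.visualize()  # no arguments: plot training and validation loss
>>> caffe2DML.visualize('conv1', varType='weight', aggFn='mean')  # mean of the 'conv1' weights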