public class RDDConverterUtilsExt extends Object
Modifier and Type | Class and Description |
---|---|
static class |
RDDConverterUtilsExt.AddRowID |
static class |
RDDConverterUtilsExt.RDDConverterTypes |
Constructor and Description |
---|
RDDConverterUtilsExt() |
Modifier and Type | Method and Description |
---|---|
static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> |
addIDToDataFrame(org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> df,
org.apache.spark.sql.SparkSession sparkSession,
String nameOfCol)
Add element indices as new column to DataFrame
|
static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> |
addIDToDataFrame(org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> df,
org.apache.spark.sql.SQLContext sqlContext,
String nameOfCol)
Deprecated.
This will be removed in SystemML 1.0.
|
static byte[] |
convertMBtoPy4JDenseArr(MatrixBlock mb) |
static MatrixBlock |
convertPy4JArrayToMB(byte[] data,
int rlen,
int clen) |
static MatrixBlock |
convertPy4JArrayToMB(byte[] data,
int rlen,
int clen,
boolean isSparse) |
static MatrixBlock |
convertPy4JArrayToMB(byte[] data,
long rlen,
long clen) |
static MatrixBlock |
convertPy4JArrayToMB(byte[] data,
long rlen,
long clen,
boolean isSparse) |
static MatrixBlock |
convertSciPyCOOToMB(byte[] data,
byte[] row,
byte[] col,
int rlen,
int clen,
int nnz) |
static MatrixBlock |
convertSciPyCOOToMB(byte[] data,
byte[] row,
byte[] col,
long rlen,
long clen,
long nnz) |
static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> |
coordinateMatrixToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc,
org.apache.spark.mllib.linalg.distributed.CoordinateMatrix input,
MatrixCharacteristics mcIn,
boolean outputEmptyBlocks)
Example usage:
|
static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> |
coordinateMatrixToBinaryBlock(org.apache.spark.SparkContext sc,
org.apache.spark.mllib.linalg.distributed.CoordinateMatrix input,
MatrixCharacteristics mcIn,
boolean outputEmptyBlocks) |
static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> |
projectColumns(org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> df,
ArrayList<String> columns) |
static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> |
stringDataFrameToVectorDataFrame(org.apache.spark.sql.SparkSession sparkSession,
org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> inputDF)
Convert a dataframe of comma-separated string rows to a dataframe of
ml.linalg.Vector rows.
|
static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> |
stringDataFrameToVectorDataFrame(org.apache.spark.sql.SQLContext sqlContext,
org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> inputDF)
Deprecated.
This will be removed in SystemML 1.0. Please migrate to
RDDConverterUtilsExt.stringDataFrameToVectorDataFrame(SparkSession, Dataset<Row>) |
public static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> coordinateMatrixToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.mllib.linalg.distributed.CoordinateMatrix input, MatrixCharacteristics mcIn, boolean outputEmptyBlocks) throws DMLRuntimeException
import org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt
import org.apache.sysml.runtime.matrix.MatrixCharacteristics
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.mllib.linalg.distributed.MatrixEntry
import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix

// Parse the space-separated ratings file into non-zero matrix entries and cache them,
// since the RDD is traversed several times below.
val entries = sc.textFile("ratings.text")
  .map(_.split(" "))
  .map(fields => new MatrixEntry(fields(0).toLong, fields(1).toLong, fields(2).toDouble))
  .filter(_.value != 0)
  .cache
// Entries must use 1-based row/column indices; fail fast otherwise.
require(entries.filter(e => e.i == 0 || e.j == 0).count == 0, "Expected 1-based ratings file")

// Derive dimensions and the non-zero count directly from the data.
val nnz = entries.count
val numRows = entries.map(_.i).max
val numCols = entries.map(_.j).max

// Describe the matrix (1000x1000 blocking) and convert to SystemML's
// binary-block format, injecting empty blocks where needed.
val coordinateMatrix = new CoordinateMatrix(entries, numRows, numCols)
val mc = new MatrixCharacteristics(numRows, numCols, 1000, 1000, nnz)
val binBlocks = RDDConverterUtilsExt.coordinateMatrixToBinaryBlock(new JavaSparkContext(sc), coordinateMatrix, mc, true)
Parameters:
sc - java spark context
input - coordinate matrix
mcIn - matrix characteristics
outputEmptyBlocks - if true, inject empty blocks if necessary
Returns:
JavaPairRDD<MatrixIndexes, MatrixBlock>
Throws:
DMLRuntimeException - if DMLRuntimeException occurs

public static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> coordinateMatrixToBinaryBlock(org.apache.spark.SparkContext sc, org.apache.spark.mllib.linalg.distributed.CoordinateMatrix input, MatrixCharacteristics mcIn, boolean outputEmptyBlocks) throws DMLRuntimeException
DMLRuntimeException
public static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> projectColumns(org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> df, ArrayList<String> columns) throws DMLRuntimeException
DMLRuntimeException
public static MatrixBlock convertPy4JArrayToMB(byte[] data, long rlen, long clen) throws DMLRuntimeException
DMLRuntimeException
public static MatrixBlock convertPy4JArrayToMB(byte[] data, int rlen, int clen) throws DMLRuntimeException
DMLRuntimeException
public static MatrixBlock convertSciPyCOOToMB(byte[] data, byte[] row, byte[] col, long rlen, long clen, long nnz) throws DMLRuntimeException
DMLRuntimeException
public static MatrixBlock convertSciPyCOOToMB(byte[] data, byte[] row, byte[] col, int rlen, int clen, int nnz) throws DMLRuntimeException
DMLRuntimeException
public static MatrixBlock convertPy4JArrayToMB(byte[] data, long rlen, long clen, boolean isSparse) throws DMLRuntimeException
DMLRuntimeException
public static MatrixBlock convertPy4JArrayToMB(byte[] data, int rlen, int clen, boolean isSparse) throws DMLRuntimeException
DMLRuntimeException
public static byte[] convertMBtoPy4JDenseArr(MatrixBlock mb) throws DMLRuntimeException
DMLRuntimeException
public static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> addIDToDataFrame(org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> df, org.apache.spark.sql.SparkSession sparkSession, String nameOfCol)
Parameters:
df - input data frame
sparkSession - the Spark Session
nameOfCol - name of index column

@Deprecated
public static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> addIDToDataFrame(org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> df, org.apache.spark.sql.SQLContext sqlContext, String nameOfCol)
Parameters:
df - input data frame
sqlContext - the SQL Context
nameOfCol - name of index column

@Deprecated
public static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> stringDataFrameToVectorDataFrame(org.apache.spark.sql.SQLContext sqlContext, org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> inputDF) throws DMLRuntimeException
RDDConverterUtilsExt.stringDataFrameToVectorDataFrame(SparkSession, Dataset<Row>)
Example input rows:
((1.2, 4.3, 3.4))
(1.2, 3.4, 2.2)
[[1.2, 34.3, 1.2, 1.25]]
[1.2, 3.4]
Parameters:
sqlContext - Spark SQL Context
inputDF - dataframe of comma-separated row strings to convert to dataframe of ml.linalg.Vector rows
Throws:
DMLRuntimeException - if DMLRuntimeException occurs

public static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> stringDataFrameToVectorDataFrame(org.apache.spark.sql.SparkSession sparkSession, org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> inputDF) throws DMLRuntimeException
Example input rows:
((1.2, 4.3, 3.4))
(1.2, 3.4, 2.2)
[[1.2, 34.3, 1.2, 1.25]]
[1.2, 3.4]
Parameters:
sparkSession - Spark Session
inputDF - dataframe of comma-separated row strings to convert to dataframe of ml.linalg.Vector rows
Throws:
DMLRuntimeException - if DMLRuntimeException occurs

Copyright © 2017 The Apache Software Foundation. All rights reserved.