Flattening a Spark DataFrame in Scala
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{Column, DataFrame}
class SparkFlattener {

  /**
   * Recursively collects one [[Column]] per leaf field of a (possibly nested) schema.
   *
   * Nested struct fields are reached with their dot-separated path (e.g. `address.city`)
   * and aliased with a camelCased flat name (e.g. `addressCity`).
   *
   * @param schema      the struct whose fields are expanded
   * @param path        dot-separated path from the root to `schema`, `None` at the root
   * @param aliasPrefix camelCase alias accumulated from parent field names, `None` at the root
   * @return one column per leaf field, selected by full path and aliased flat
   */
  private def flatten(
      schema: StructType,
      path: Option[String] = None,
      aliasPrefix: Option[String] = None): Array[Column] =
    schema.fields.flatMap { f =>
      // `fold`: use the bare field name at the root, otherwise extend the accumulated path/prefix.
      val colName  = path.fold(f.name)(p => s"$p.${f.name}")
      val colAlias = aliasPrefix.fold(f.name)(p => s"$p${f.name.capitalize}")
      f.dataType match {
        // Recurse into nested structs; everything else is a leaf column.
        case st: StructType => flatten(st, Some(colName), Some(colAlias))
        case _              => Array(col(colName).as(colAlias))
      }
    }

  /**
   * Flattens a DataFrame, i.e. transforms nested structs into flat top-level columns
   * whose aliases are the camelCased field paths (e.g. `address.city` -> `addressCity`).
   * Arrays are not flattened, as they cannot be expanded column-wise.
   *
   * NOTE(review): assumes leaf field names contain no literal dots — `col(path)` would
   * otherwise misinterpret them as nested access; confirm against the input schemas.
   *
   * @param df the DataFrame to flatten
   * @return a DataFrame with one flat column per leaf field of `df`'s schema
   */
  def flatten(df: DataFrame): DataFrame =
    df.select(flatten(df.schema): _*)
}
To contact me, send an email any time or leave a comment below.