Flattening a Spark DataFrame in Scala

import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{Column, DataFrame}

/**
 * Turns the nested struct columns of a Spark DataFrame into top-level columns.
 *
 * Each leaf field is exposed under a camel-cased alias built from its path:
 * the outermost name is kept as-is and every deeper name is capitalized and
 * appended, so `address.home.city` becomes `addressHomeCity`.
 *
 * Only `StructType` fields are descended into; every other type (arrays and
 * maps included) is selected unchanged. A struct with no fields contributes
 * no columns.
 *
 * NOTE: aliases are plain concatenations, so two different paths can produce
 * the same alias (`a.bC` and `aB.c` both yield `aBC`). The resulting DataFrame
 * then contains duplicate column names.
 */
class SparkFlattener {

  /**
   * Wraps one field name in backticks so that dots and other special
   * characters inside the name are treated literally instead of as path
   * separators. Embedded backticks are doubled, the escape form of Spark SQL
   * quoted identifiers.
   * NOTE(review): confirm the targeted Spark version accepts doubled
   * backticks inside column names.
   */
  private def quote(name: String): String =
    "`" + name.replace("`", "``") + "`"

  /**
   * Recursively collects one aliased column per leaf field of `schema`.
   *
   * @param schema  struct whose fields are being flattened
   * @param parents names of the enclosing structs, outermost first; empty at
   *                the top level
   * @return the leaf columns in schema order
   */
  private def flattenFields(
      schema: StructType,
      parents: Vector[String] = Vector.empty
  ): Array[Column] =
    schema.fields.flatMap { field =>
      val segments = parents :+ field.name

      field.dataType match {
        case nested: StructType =>
          flattenFields(nested, segments)
        case _ =>
          // Quote every segment separately: the dots joining them are the only
          // ones Spark should interpret as struct-field access.
          val reference = segments.map(quote).mkString(".")
          // Outermost name verbatim, deeper names capitalized.
          val alias = (segments.take(1) ++ segments.drop(1).map(_.capitalize)).mkString
          Array(col(reference).as(alias))
      }
    }

  /**
   * Flattens a DataFrame, i.e. replaces every struct column by one top-level
   * column per leaf field, named after the field's full path in camel case.
   * Arrays are not flattened; they are selected as they are.
   *
   * @param df the DataFrame to flatten
   * @return a DataFrame containing only non-struct columns
   */
  def flatten(df: DataFrame): DataFrame =
    df.select(flattenFields(df.schema): _*)
}


To contact me, send an email anytime or leave a comment below.