Spark: Union Incompatible Dataframes

To union two DataFrames whose number and order of columns differ:

def incompat_union(df1: DataFrame, df2: DataFrame) -> DataFrame:
    """
    Union two DataFrames whose column sets and column order may differ.

    The result contains every column of df1 in df1's order, followed by the columns
    that exist only in df2 (in df2's order). A column missing on one side is filled
    with nulls on that side, cast to the data type the column has on the other side.
    Column names are matched by exact (case-sensitive) string comparison.

    :param df1: first DataFrame; its column order leads the result.
    :param df2: second DataFrame; re-projected to df1's extended layout before the union.
    :return: the union of both DataFrames over the combined set of columns.
    """
    df1_names = set(df1.columns)
    df2_names = set(df2.columns)

    # Columns that only df2 has, in df2 order and without repeats.
    only_in_df2 = []
    for name in df2.columns:
        if name not in df1_names and name not in only_in_df2:
            log.info(f"'{name}' not in df1, adding")
            only_in_df2.append(name)

    # Extend df1 in a single projection instead of one withColumn() call per
    # column: every withColumn() adds another projection to the plan, and it
    # would replace (not add) a column if the name resolved to an existing one.
    if only_in_df2:
        null_fillers = [
            f.lit(None).cast(df2.schema[name].dataType).alias(name)
            for name in only_in_df2
        ]
        df1u = df1.select("*", *null_fillers)
    else:
        df1u = df1

    # Re-project df2 to exactly df1u's columns and order; union() pairs columns
    # by position, so both sides must line up.
    aligned = []
    for name in df1u.columns:
        if name in df2_names:
            # df2 has this column: take it as is
            aligned.append(f.col(name))
        else:
            # df2 lacks it: typed null placeholder (the column can only come from df1)
            aligned.append(f.lit(None).cast(df1.schema[name].dataType).alias(name))
            log.info(f"'{name}' not in df2, adding")
    df2u = df2.select(*aligned)

    return df1u.union(df2u)


To contact me, send an email anytime or leave a comment below.