Spark: Union Incompatible Dataframes
To union two DataFrames whose columns differ in number or order, add the missing columns to each side as nulls and align the column order before calling union:
def incompat_union(df1: DataFrame, df2: DataFrame) -> DataFrame:
    """
    Union two DataFrames whose columns differ in number and/or order.

    The result contains every column of ``df1`` (in ``df1``'s order) followed by
    the columns that exist only in ``df2`` (in ``df2``'s order). A column that is
    missing from one side is filled with nulls for that side's rows.

    Column names are matched by exact (case-sensitive) string comparison.

    :param df1: first DataFrame; its columns come first in the result.
    :param df2: second DataFrame; it is re-projected to the combined column
        layout before being unioned onto ``df1``.
    :return: union of both DataFrames over the combined set of columns.
    """
    # Read the column lists once instead of on every loop iteration, and use
    # sets for the membership tests.
    df1_cols = df1.columns
    df2_cols = df2.columns
    df1_names = set(df1_cols)
    df2_names = set(df2_cols)

    # Data types per column, so null fillers can be cast to the type used on
    # the other side rather than left as untyped nulls.
    df1_types = {field.name: field.dataType for field in df1.schema.fields}
    df2_types = {field.name: field.dataType for field in df2.schema.fields}

    # Take df1 and add the columns it is missing, filled with nulls. All of them
    # are added in a single select(): chaining withColumn() in a loop adds one
    # projection per column and can generate very large query plans.
    missing_in_df1 = []
    for name in df2_cols:
        if name not in df1_names:
            log.info(f"'{name}' not in df1, adding")
            missing_in_df1.append(name)
    if missing_in_df1:
        fillers = [
            f.lit(None).cast(df2_types[name]).alias(name)
            for name in missing_in_df1
        ]
        df1u = df1.select("*", *fillers)
    else:
        df1u = df1

    # Take df2 and make it identical in columns and order to df1u, because
    # union() matches columns by position, not by name.
    cols2 = []
    for name in df1_cols + missing_in_df1:
        if name in df2_names:
            # df2 has this column: use it as-is
            cols2.append(f.col(name))
        else:
            # df2 lacks this column: add a typed null placeholder
            cols2.append(f.lit(None).cast(df1_types[name]).alias(name))
            log.info(f"'{name}' not in df2, adding")
    df2u = df2.select(*cols2)

    return df1u.union(df2u)
To contact me, send an email anytime or leave a comment below.