Reshuffle Schema Columns in Spark DataFrame
If you need to change the order of columns inside a Spark DataFrame, here is a solution:
def move_col_after(df: DataFrame, col_to_move: str, col_after: str) -> DataFrame:
    """Reorder *df* so that ``col_to_move`` appears directly after ``col_after``.

    :param df: dataframe whose columns should be reordered
    :param col_to_move: name of the column to relocate
    :param col_after: name of the column it should follow
    :return: a new dataframe selecting the same columns in the new order
    """
    placed = False
    new_order = []
    for name in df.schema.fieldNames():
        if name == col_to_move:
            continue  # skip the column here; it is re-inserted below
        new_order.append(name)
        if name == col_after:
            new_order.append(col_to_move)
            placed = True
    # If col_after was not found, keep the column at the end so no data is lost.
    if not placed:
        new_order.append(col_to_move)
    return df.select(*new_order)
def move_cols_after(df: DataFrame, col_after: str, *cols_to_move) -> DataFrame:
    """Reorder *df* so that ``cols_to_move`` appear directly after ``col_after``.

    The moved columns keep the relative order in which they are passed.

    :param df: dataframe whose columns should be reordered
    :param col_after: name of the column the moved columns should follow
    :param cols_to_move: names of the columns to relocate
    :return: a new dataframe selecting the same columns in the new order
    """
    if not cols_to_move:
        return df
    ordered_cols = []
    moved = False
    for fld in df.schema.fieldNames():
        if fld not in cols_to_move:
            ordered_cols.append(fld)
            if fld == col_after:
                ordered_cols.extend(cols_to_move)
                moved = True
    # Bug fix: mirror move_col_after's fallback — if col_after was not found
    # (or was itself one of cols_to_move), keep the moved columns at the end
    # instead of silently dropping them from the select.
    if not moved:
        ordered_cols.extend(cols_to_move)
    return df.select(*ordered_cols)
def move_col_to_position(df: DataFrame, col_name: str, pos: int) -> DataFrame:
    """Reorder *df* so that ``col_name`` sits at index ``pos``.

    Bug fixes versus the index-walking version:
    - a ``pos`` past the end of the columns no longer silently drops the
      column; it is placed last instead,
    - moving a column to a LATER position now lands it at ``pos`` exactly
      (previously the removal shifted everything left, so it landed at
      ``pos - 1``).

    :param df: dataframe whose columns should be reordered
    :param col_name: name of the column to relocate
    :param pos: target index for the column (clamped to the valid range)
    :return: a new dataframe selecting the same columns in the new order
    """
    # Remove the column first, then insert at the target index; list.insert
    # clamps out-of-range positions instead of losing the column.
    ordered_cols = [c for c in df.columns if c != col_name]
    ordered_cols.insert(pos, col_name)
    return df.select(*ordered_cols)
Usage should be clear from the function docstrings above.
If you have questions, send me an email anytime or leave a comment below.