[pyspark] Join two data frames, select all columns from one and some columns from the other

function to drop duplicate columns after joining.

check it

def dropDupeDfCols(df): newcols = [] dupcols = []

for i in range(len(df.columns)):
    if df.columns[i] not in newcols:
        newcols.append(df.columns[i])
    else:
        dupcols.append(i)

df = df.toDF(*[str(i) for i in range(len(df.columns))])
for dupcol in dupcols:
    df = df.drop(str(dupcol))

return df.toDF(*newcols)