To make sure it does not fail for string
, date
and timestamp
columns:
import pyspark.sql.functions as F
def count_missings(spark_df,sort=True):
"""
Counts number of nulls and nans in each column
"""
df = spark_df.select([F.count(F.when(F.isnan(c) | F.isnull(c), c)).alias(c) for (c,c_type) in spark_df.dtypes if c_type not in ('timestamp', 'string', 'date')]).toPandas()
if len(df) == 0:
print("There are no any missing values!")
return None
if sort:
return df.rename(index={0: 'count'}).T.sort_values("count",ascending=False)
return df
If you want to see the columns sorted based on the number of nans and nulls in descending:
count_missings(spark_df)
# | Col_A | 10 |
# | Col_C | 2 |
# | Col_B | 1 |
If you don't want ordering and see them as a single row:
count_missings(spark_df, False)
# | Col_A | Col_B | Col_C |
# | 10 | 1 | 2 |