Get S3 filesystem details using PySpark

Unfortunately, Spark doesn't give you much information about the underlying files. The deepest you can go from the DataFrame API is the input_file_name function, and that's it.
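For reference, input_file_name only tags each row with the path of the file it was read from, nothing about size or modification time. A minimal sketch (the read path is just an illustrative placeholder):

from pyspark.sql.functions import input_file_name

# Each row gets the S3 path of the file it came from - no size, no timestamp.
df_src = (spark.read.parquet("s3://a_bucket/some_table/")   # hypothetical path
          .withColumn("source_file", input_file_name()))

You can, however, list the files themselves through the Hadoop FileSystem API exposed via the JVM gateway. Here is a function I wrote: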

from pyspark.sql import SparkSession


def list_files_with_hdfs(spark: SparkSession, path_root: str) -> list:
    """Recursively list all files under path_root using the Hadoop FileSystem API."""
    sc = spark.sparkContext

    # Build a java.net.URI and a Hadoop Path for the root we want to scan.
    java_path = sc._jvm.java.net.URI.create(path_root)
    hadoop_path = sc._jvm.org.apache.hadoop.fs.Path(path_root)

    # Resolve the FileSystem implementation for that URI (S3, HDFS, ...).
    hadoop_file_system = sc._jvm.org.apache.hadoop.fs.FileSystem.get(
        java_path,
        sc._jvm.org.apache.hadoop.conf.Configuration())

    # listFiles(path, True) walks the tree recursively and returns a RemoteIterator.
    iterator = hadoop_file_system.listFiles(hadoop_path, True)

    result = []
    while iterator.hasNext():
        item = iterator.next()
        el = {
            "path": item.getPath().toString(),
            "length": item.getLen(),
            "modTime": item.getModificationTime()
        }
        result.append(el)
        # Print a small progress marker every 1000 files.
        if len(result) % 1000 == 0:
            print(f"{len(result)}. ", end="")

    return result
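One detail worth noting: the function above creates a fresh org.apache.hadoop.conf.Configuration(), which is fine when your S3 credentials come from the environment or an instance profile. If your S3 settings (keys, endpoint, etc.) live on the Spark session instead, you may prefer to reuse Spark's own Hadoop configuration; a minimal sketch of that variant:

# Reuse the running application's Hadoop configuration so that any fs.s3a.*
# settings applied to the Spark session are picked up by the FileSystem.
hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
hadoop_file_system = spark.sparkContext._jvm.org.apache.hadoop.fs.FileSystem.get(
    java_path, hadoop_conf)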

The modTime comes back as a long (epoch milliseconds), so you can prettify it as well:

fis = list_files_with_hdfs(spark, "s3://a_bucket/")

df = spark.createDataFrame(fis)

df = (df
      .select(df.path,
              df.length.alias("lengthBytes"),
              (df.modTime / 1000).cast("timestamp").alias("modTime")))

display(df)
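display is the Databricks notebook helper; outside Databricks, df.show(truncate=False) does the job. Once the listing is a DataFrame, you can aggregate it like any other; a small sketch using the columns defined above:

from pyspark.sql import functions as F

# Total file count and total size in GiB across the whole listing.
summary = df.agg(
    F.count("*").alias("fileCount"),
    (F.sum("lengthBytes") / (1024 ** 3)).alias("totalGiB"))
summary.show()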


To contact me, send an email anytime or leave a comment below.