Get S3 filesystem details using PySpark
Unfortunately, Spark doesn’t give you information about the underlying files - the deepest you can go is the input_file_name function, and that’s it. But you can get it through the Hadoop FileSystem (HDFS) API; here is a function I wrote:
from pyspark.sql import DataFrame, SparkSession, Column

def list_files_with_hdfs(spark: SparkSession, path_root: str):
    sc = spark.sparkContext

    # Resolve the filesystem behind the given URI (s3://, abfss://, hdfs://, ...)
    java_path = sc._jvm.java.net.URI.create(path_root)
    hadoop_path = sc._jvm.org.apache.hadoop.fs.Path(path_root)
    hadoop_file_system = sc._jvm.org.apache.hadoop.fs.FileSystem.get(
        java_path,
        sc._jvm.org.apache.hadoop.conf.Configuration())

    # Recursively list every file under the root path
    iterator = hadoop_file_system.listFiles(hadoop_path, True)

    result = []
    while iterator.hasNext():
        item = iterator.next()
        el = {
            "path": item.getPath().toString(),
            "length": item.getLen(),
            "modTime": item.getModificationTime()
        }
        result.append(el)
        # Simple progress indicator every 1000 files
        if len(result) % 1000 == 0:
            print(f"{len(result)}. ", end="")
    return result
The modTime is a long number (milliseconds since the epoch), so you can prettify it as well:
fis = list_files_with_hdfs(spark, "s3://a_bucket/")
df = spark.createDataFrame(fis)

# Convert epoch milliseconds to a proper timestamp column
df = (df
      .select(df.path,
              df.length.alias("lengthBytes"),
              (df.modTime / 1000).cast("timestamp").alias("modTime")))
display(df)
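Once the listing is in a DataFrame, you can summarize it like any other data. Here is a minimal sketch that reuses the df built above; the fileCount and totalGiB column names are just illustrative choices, not anything the API requires:

from pyspark.sql import functions as F

# Overall file count and total size in GiB, reusing `df` from the snippet above
summary = df.agg(
    F.count("path").alias("fileCount"),
    (F.sum("lengthBytes") / (1024 ** 3)).alias("totalGiB"))
display(summary)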
To contact me, send an email anytime or leave a comment below.