I am working on a script to recursively go through subfolders in a mainfolder and build a list off a certain file type. I am having an issue with the script. Its currently set as follows
for root, subFolder, files in os.walk(PATH):
for item in files:
if item.endswith(".txt") :
fileNamePath = str(os.path.join(root,subFolder,item))
the problem is that the subFolder variable is pulling in a list of subfolders rather than the folder that the ITEM file is located. I was thinking of running a for loop for the subfolder before and join the first part of the path but I figured Id double check to see if anyone has any suggestions before that. Thanks for your help!
I will translate John La Rooy's list comprehension to nested for's, just in case anyone else has trouble understanding it.
result = [y for x in os.walk(PATH) for y in glob(os.path.join(x[0], '*.txt'))]
Should be equivalent to:
import glob
import os
result = []
for x in os.walk(PATH):
for y in glob.glob(os.path.join(x[0], '*.txt')):
result.append(y)
Here's the documentation for list comprehension and the functions os.walk and glob.glob.
Changed in Python 3.5: Support for recursive globs using “**”.
glob.glob()
got a new recursive parameter.
If you want to get every .txt
file under my_path
(recursively including subdirs):
import glob
files = glob.glob(my_path + '/**/*.txt', recursive=True)
# my_path/ the dir
# **/ every file and dir under my_path
# *.txt every file that ends with '.txt'
If you need an iterator you can use iglob as an alternative:
for file in glob.iglob(my_path, recursive=False):
# ...
Recursive is new in Python 3.5, so it won't work on Python 2.7. Here is the example that uses r
strings so you just need to provide the path as is on either Win, Lin, ...
import glob
mypath=r"C:\Users\dj\Desktop\nba"
files = glob.glob(mypath + r'\**\*.py', recursive=True)
# print(files) # as list
for f in files:
print(f) # nice looking single line per file
Note: It will list all files, no matter how deep it should go.
You should be using the dirpath
which you call root
. The dirnames
are supplied so you can prune it if there are folders that you don't wish os.walk
to recurse into.
import os
result = [os.path.join(dp, f) for dp, dn, filenames in os.walk(PATH) for f in filenames if os.path.splitext(f)[1] == '.txt']
Edit:
After the latest downvote, it occurred to me that glob
is a better tool for selecting by extension.
import os
from glob import glob
result = [y for x in os.walk(PATH) for y in glob(os.path.join(x[0], '*.txt'))]
Also a generator version
from itertools import chain
result = (chain.from_iterable(glob(os.path.join(x[0], '*.txt')) for x in os.walk('.')))
Edit2 for Python 3.4+
from pathlib import Path
result = list(Path(".").rglob("*.[tT][xX][tT]"))
If you don't mind installing an additional light library, you can do this:
pip install plazy
Usage:
import plazy
txt_filter = lambda x : True if x.endswith('.txt') else False
files = plazy.list_files(root='data', filter_func=txt_filter, is_include_root=True)
The result should look something like this:
['data/a.txt', 'data/b.txt', 'data/sub_dir/c.txt']
It works on both Python 2.7 and Python 3.
Github: https://github.com/kyzas/plazy#list-files
Disclaimer: I'm an author of plazy
.
Your original solution was very nearly correct, but the variable "root" is dynamically updated as it recursively paths around. os.walk() is a recursive generator. Each tuple set of (root, subFolder, files) is for a specific root the way you have it setup.
i.e.
root = 'C:\\'
subFolder = ['Users', 'ProgramFiles', 'ProgramFiles (x86)', 'Windows', ...]
files = ['foo1.txt', 'foo2.txt', 'foo3.txt', ...]
root = 'C:\\Users\\'
subFolder = ['UserAccount1', 'UserAccount2', ...]
files = ['bar1.txt', 'bar2.txt', 'bar3.txt', ...]
...
I made a slight tweak to your code to print a full list.
import os
for root, subFolder, files in os.walk(PATH):
for item in files:
if item.endswith(".txt") :
fileNamePath = str(os.path.join(root,item))
print(fileNamePath)
Hope this helps!
EDIT: (based on feeback)
OP misunderstood/mislabeled the subFolder variable, as it is actually all the sub folders in "root". Because of this, OP, you're trying to do os.path.join(str, list, str), which probably doesn't work out like you expected.
To help add clarity, you could try this labeling scheme:
import os
for current_dir_path, current_subdirs, current_files in os.walk(RECURSIVE_ROOT):
for aFile in current_files:
if aFile.endswith(".txt") :
txt_file_path = str(os.path.join(current_dir_path, aFile))
print(txt_file_path)
Its not the most pythonic answer, but I'll put it here for fun because it's a neat lesson in recursion
def find_files( files, dirs=[], extensions=[]):
new_dirs = []
for d in dirs:
try:
new_dirs += [ os.path.join(d, f) for f in os.listdir(d) ]
except OSError:
if os.path.splitext(d)[1] in extensions:
files.append(d)
if new_dirs:
find_files(files, new_dirs, extensions )
else:
return
On my machine I have two folders, root
and root2
mender@multivax ]ls -R root root2
root:
temp1 temp2
root/temp1:
temp1.1 temp1.2
root/temp1/temp1.1:
f1.mid
root/temp1/temp1.2:
f.mi f.mid
root/temp2:
tmp.mid
root2:
dummie.txt temp3
root2/temp3:
song.mid
Lets say I want to find all .txt
and all .mid
files in either of these directories, then I can just do
files = []
find_files( files, dirs=['root','root2'], extensions=['.mid','.txt'] )
print(files)
#['root2/dummie.txt',
# 'root/temp2/tmp.mid',
# 'root2/temp3/song.mid',
# 'root/temp1/temp1.1/f1.mid',
# 'root/temp1/temp1.2/f.mid']
You can use the "recursive" setting within glob module to search through subdirectories
For example:
import glob
glob.glob('//Mypath/folder/**/*',recursive = True)
The second line would return all files within subdirectories for that folder location (Note, you need the '**/*' string at the end of your folder string to do this.)
If you specifically wanted to find text files deep within your subdirectories, you can use
glob.glob('//Mypath/folder/**/*.txt',recursive = True)
This function will recursively put only files into a list.
import os
def ls_files(dir):
files = list()
for item in os.listdir(dir):
abspath = os.path.join(dir, item)
try:
if os.path.isdir(abspath):
files = files + ls_files(abspath)
else:
files.append(abspath)
except FileNotFoundError as err:
print('invalid directory\n', 'Error: ', err)
return files
The new pathlib
library simplifies this to one line:
from pathlib import Path
result = list(Path(PATH).glob('**/*.txt'))
You can also use the generator version:
from pathlib import Path
for file in Path(PATH).glob('**/*.txt'):
pass
This returns Path
objects, which you can use for pretty much anything, or get the file name as a string by file.name
.
This seems to be the fastest solution I could come up with, and is faster than os.walk
and a lot faster than any glob
solution.
f.path
to f.name
(do not change it for subfolders!).Args: dir: str, ext: list
.
Function returns two lists: subfolders, files
.
See below for a detailed speed anaylsis.
def run_fast_scandir(dir, ext): # dir: str, ext: list
subfolders, files = [], []
for f in os.scandir(dir):
if f.is_dir():
subfolders.append(f.path)
if f.is_file():
if os.path.splitext(f.name)[1].lower() in ext:
files.append(f.path)
for dir in list(subfolders):
sf, f = run_fast_scandir(dir, ext)
subfolders.extend(sf)
files.extend(f)
return subfolders, files
subfolders, files = run_fast_scandir(folder, [".jpg"])
In case you need the file size, you can also create a sizes
list and add f.stat().st_size
like this for a display of MiB:
sizes.append(f"{f.stat().st_size/1024/1024:.0f} MiB")
Speed analysis
for various methods to get all files with a specific file extension inside all subfolders and the main folder.
tl;dr:
fast_scandir
clearly wins and is twice as fast as all other solutions, except os.walk.os.walk
is second place slighly slower.glob
will greatly slow down the process.fast_scandir took 499 ms. Found files: 16596. Found subfolders: 439
os.walk took 589 ms. Found files: 16596
find_files took 919 ms. Found files: 16596
glob.iglob took 998 ms. Found files: 16596
glob.glob took 1002 ms. Found files: 16596
pathlib.rglob took 1041 ms. Found files: 16596
os.walk-glob took 1043 ms. Found files: 16596
Tests were done with W7x64, Python 3.8.1, 20 runs. 16596 files in 439 (partially nested) subfolders.
find_files
is from https://stackoverflow.com/a/45646357/2441026 and lets you search for several extensions.
fast_scandir
was written by myself and will also return a list of subfolders. You can give it a list of extensions to search for (I tested a list with one entry to a simple if ... == ".jpg"
and there was no significant difference).
# -*- coding: utf-8 -*-
# Python 3
import time
import os
from glob import glob, iglob
from pathlib import Path
directory = r"<folder>"
RUNS = 20
def run_os_walk():
a = time.time_ns()
for i in range(RUNS):
fu = [os.path.join(dp, f) for dp, dn, filenames in os.walk(directory) for f in filenames if
os.path.splitext(f)[1].lower() == '.jpg']
print(f"os.walk\t\t\ttook {(time.time_ns() - a) / 1000 / 1000 / RUNS:.0f} ms. Found files: {len(fu)}")
def run_os_walk_glob():
a = time.time_ns()
for i in range(RUNS):
fu = [y for x in os.walk(directory) for y in glob(os.path.join(x[0], '*.jpg'))]
print(f"os.walk-glob\ttook {(time.time_ns() - a) / 1000 / 1000 / RUNS:.0f} ms. Found files: {len(fu)}")
def run_glob():
a = time.time_ns()
for i in range(RUNS):
fu = glob(os.path.join(directory, '**', '*.jpg'), recursive=True)
print(f"glob.glob\t\ttook {(time.time_ns() - a) / 1000 / 1000 / RUNS:.0f} ms. Found files: {len(fu)}")
def run_iglob():
a = time.time_ns()
for i in range(RUNS):
fu = list(iglob(os.path.join(directory, '**', '*.jpg'), recursive=True))
print(f"glob.iglob\t\ttook {(time.time_ns() - a) / 1000 / 1000 / RUNS:.0f} ms. Found files: {len(fu)}")
def run_pathlib_rglob():
a = time.time_ns()
for i in range(RUNS):
fu = list(Path(directory).rglob("*.jpg"))
print(f"pathlib.rglob\ttook {(time.time_ns() - a) / 1000 / 1000 / RUNS:.0f} ms. Found files: {len(fu)}")
def find_files(files, dirs=[], extensions=[]):
# https://stackoverflow.com/a/45646357/2441026
new_dirs = []
for d in dirs:
try:
new_dirs += [ os.path.join(d, f) for f in os.listdir(d) ]
except OSError:
if os.path.splitext(d)[1].lower() in extensions:
files.append(d)
if new_dirs:
find_files(files, new_dirs, extensions )
else:
return
def run_fast_scandir(dir, ext): # dir: str, ext: list
# https://stackoverflow.com/a/59803793/2441026
subfolders, files = [], []
for f in os.scandir(dir):
if f.is_dir():
subfolders.append(f.path)
if f.is_file():
if os.path.splitext(f.name)[1].lower() in ext:
files.append(f.path)
for dir in list(subfolders):
sf, f = run_fast_scandir(dir, ext)
subfolders.extend(sf)
files.extend(f)
return subfolders, files
if __name__ == '__main__':
run_os_walk()
run_os_walk_glob()
run_glob()
run_iglob()
run_pathlib_rglob()
a = time.time_ns()
for i in range(RUNS):
files = []
find_files(files, dirs=[directory], extensions=[".jpg"])
print(f"find_files\t\ttook {(time.time_ns() - a) / 1000 / 1000 / RUNS:.0f} ms. Found files: {len(files)}")
a = time.time_ns()
for i in range(RUNS):
subf, files = run_fast_scandir(directory, [".jpg"])
print(f"fast_scandir\ttook {(time.time_ns() - a) / 1000 / 1000 / RUNS:.0f} ms. Found files: {len(files)}. Found subfolders: {len(subf)}")
You can do it this way to return you a list of absolute path files.
def list_files_recursive(path):
"""
Function that receives as a parameter a directory path
:return list_: File List and Its Absolute Paths
"""
import os
files = []
# r = root, d = directories, f = files
for r, d, f in os.walk(path):
for file in f:
files.append(os.path.join(r, file))
lst = [file for file in files]
return lst
if __name__ == '__main__':
result = list_files_recursive('/tmp')
print(result)
Source: Stackoverflow.com