Some days ago my brother asked me if I could help him batch-rename about 200 PDF files.
The name needed for each file was the heading found on its first page…
So I wrote a quick and hacky script and experimented with the pypdf library (its PdfReader class) –
it worked really nicely and took only about 2 hours of learning and getting it to work as it should 😉
#!/usr/bin/env python3
import os
import re
from pathlib import Path
from pypdf import PdfReader
from operator import itemgetter, attrgetter
# Start-up banner: a row of stars, the script name, another row of stars,
# then one empty line.
print(42 * "*", "running script rename.py", 42 * "*", "", sep="\n")
# Extract ids and title from the heading text. Named groups:
#   id1   - "<digits>-<digits>" following the fixed lead-in text
#   title - the text between id1 and the parenthesised id2 (non-greedy)
#   id2   - inside the parentheses: optional whitespace, "G", then digits
#           that may be separated by whitespace
# The digit run is written as \d[\d\s]* instead of (\d+\s*)+ : both accept
# exactly the same strings, but the nested quantifier backtracks
# exponentially when the closing parenthesis is missing after a long run
# of digits.
regex_title = re.compile(
    r"Some Fixed Pre Text \(LM\),\s*(?P<id1>\d+-\d+)\s*(?P<title>.*?)\s*\((?P<id2>\s*G\d[\d\s]*)\)",
    re.IGNORECASE,
)
def parse_file(filename, *, y_min=600, y_max=1020):
    """Read the heading from the first page of a PDF and split it into parts.

    Only text fragments whose vertical position lies strictly between
    ``y_min`` and ``y_max`` are considered; the collected text is joined
    into one line and searched with ``regex_title``.

    Args:
        filename: Path of the PDF file, passed on to ``PdfReader``.
        y_min: Lower bound (exclusive) of the vertical band to read.
        y_max: Upper bound (exclusive) of the vertical band to read.

    Returns:
        On a match, a dict with the keys ``"id1"``, ``"title"`` and
        ``"id2"``. Slashes in the title are replaced by ``-`` and all
        whitespace is removed from ``id2``.
        Without a match (or for a PDF without pages), a dict holding only
        ``"text_body"`` with the text that was examined.
    """
    print(42 * "-")
    print(f"reading file '{filename}'")
    reader = PdfReader(filename)

    # A PDF without pages has no heading; report it like any other
    # non-matching file instead of failing with IndexError.
    if len(reader.pages) == 0:
        return {"text_body": ""}
    page = reader.pages[0]

    parts = []

    def visitor_body(text, cm, tm, font_dict, font_size):
        # Keep only the top part of the page. pypdf passes the current
        # transformation matrix as ``cm``; its element 5 is used here as
        # the vertical position of the text fragment.
        y = cm[5]
        if y_min < y < y_max:
            parts.append(text)

    page.extract_text(visitor_text=visitor_body)

    # Collapse the collected fragments into one continuous line so the
    # regex does not have to deal with line breaks.
    text_body = " ".join("".join(parts).splitlines())

    match = regex_title.search(text_body)
    if match is None:
        return {"text_body": text_body}

    result = match.groupdict()
    # "/" would be interpreted as a path separator in the new file name.
    result["title"] = result["title"].replace('/', '-')
    # The pattern allows any whitespace inside id2, so strip all of it
    # (not only plain spaces).
    result["id2"] = "".join(result["id2"].split())
    return result
def get_filelist():
    """Return all ``*.pdf`` paths below the current directory (recursive)."""
    return [*Path('.').glob('**/*.pdf')]
def main():
    """Rename every PDF found by ``get_filelist`` after its parsed heading.

    The new stem is ``"<id1> - <title> - <id2>"``; the suffix and the
    directory of the file are kept. Files whose heading could not be
    parsed are reported and left untouched, as are files that already
    carry their target name. If another file already uses the target
    name, " - 1" is appended to the stem until the name is free.
    """
    results = []
    for file in get_filelist():
        result_dict = parse_file(file)
        # parse_file returns only "text_body" when the heading did not
        # match; such entries lack the sort keys and cannot be renamed.
        if "title" not in result_dict:
            print(f"skipping '{file}': no heading found")
            continue
        result_dict["filename"] = file
        # Modification time; recorded for reference, not used for sorting.
        result_dict["birth"] = file.stat().st_mtime
        results.append(result_dict)

    # We now have one dict per file with the extracted title and ids.
    # Sort them so the renames happen in a stable, predictable order.
    results.sort(key=itemgetter('id1', 'title', 'id2'))

    for result in results:
        title = result["title"]
        id1 = result["id1"]
        id2 = result["id2"]
        filename_old = result["filename"].resolve()

        # Replace only the stem; directory and suffix stay as they are.
        stem_new = f"{id1} - {title} - {id2}"
        filename_new = filename_old.with_stem(stem_new)

        # Already named correctly (e.g. the script is run a second time):
        # without this check the file would collide with itself and get
        # " - 1" appended on every run.
        if filename_new == filename_old:
            continue

        # Extend the stem if this name is already taken.
        while filename_new.exists():
            filename_new = filename_new.with_stem(filename_new.stem + " - 1")

        print(filename_new)
        filename_old.rename(filename_new)
# Run the renaming only when executed as a script, so that importing this
# module does not rename any files.
if __name__ == "__main__":
    main()
Maybe it's of help to others…
or maybe it's just a personal memo.