text – Stefans Traum Welt

A few days ago my brother asked me if I could help him batch-rename about 200 PDF files.
The new name for each file was the heading found on its first page…

So I wrote a quick and hacky script and experimented with the PdfReader class from the pypdf library –
it worked really nicely, and it took only about 2 hours of learning to get it working as it should 😉

#!/usr/bin/env python3

import os
import re

from pathlib import Path

from pypdf import PdfReader

from operator import itemgetter, attrgetter

# Startup banner: a row of 42 stars, the script name, another row of stars,
# and one empty line.
print("*" * 42, "running script rename.py", "*" * 42, "", sep="\n")



# Extract both ids and the title from the flattened heading text.
# Expected shape: "Some Fixed Pre Text (LM), <id1> <title> (<id2>)", where
#   id1   - two digit runs joined by a hyphen
#   title - the text between id1 and the parenthesised id2 (matched lazily)
#   id2   - "G" directly followed by digit runs that may be separated by
#           whitespace; leading/trailing whitespace inside the parentheses
#           is part of the group
# The digit runs are written as \d+(?:\s+\d+)*\s* instead of the nested
# quantifier (\d+\s*)+. Both accept exactly the same strings, but the nested
# form backtracks exponentially when a long digit run is not followed by ")".
# Only the named groups are meant to be used; the helper group is non-capturing.
regex_title = re.compile(
    r"Some Fixed Pre Text \(LM\),\s*(?P<id1>\d+-\d+)\s*(?P<title>.*?)\s*"
    r"\((?P<id2>\s*G\d+(?:\s+\d+)*\s*)\)",
    re.IGNORECASE,
)


def parse_file(filename):
    """Read the heading on the first page of a PDF and split it into its parts.

    The text fragments of page 0 whose vertical position lies in the band
    600 < y < 1020 are concatenated, line breaks are turned into single
    spaces, and the result is searched with ``regex_title``.

    Returns a dict with the keys ``id1``, ``title`` and ``id2`` when the
    heading matches ("/" in the title is replaced by "-" and spaces are
    removed from id2). When nothing matches, the dict holds only the
    flattened text under the key ``text_body``.
    """
    print(42 * "-")
    print(f"reading file '{filename}'")
    first_page = PdfReader(filename).pages[0]

    header_chunks = []

    def visitor_body(text, cm, tm, font_dict, font_size):
        # Keep only fragments from the top part of the page.
        # NOTE(review): cm[5] is used as the y position - presumably the
        # vertical offset of the current transformation matrix; confirm
        # against the pypdf visitor documentation.
        if 600 < cm[5] < 1020:
            header_chunks.append(text)

    first_page.extract_text(visitor_text=visitor_body)

    # Collapse the collected fragments into one continuous line.
    text_body = " ".join("".join(header_chunks).splitlines())

    found = regex_title.search(text_body)
    if found is None:
        # Hand the raw text back so the caller can see what was extracted.
        return {"text_body": text_body}

    fields = found.groupdict()
    # "/" would be read as a path separator in the new file name.
    fields["title"] = fields["title"].replace("/", "-")
    fields["id2"] = fields["id2"].replace(" ", "")
    return fields


def get_filelist():
    """Return all ``*.pdf`` paths found in the current directory and below.

    The paths are relative to the current working directory and come in the
    order the directory walk yields them.
    """
    return list(Path(".").rglob("*.pdf"))

def main():
    """Rename every PDF below the current directory after its first-page heading.

    Each file from ``get_filelist`` is parsed with ``parse_file``. The new name
    is ``"<id1> - <title> - <id2>"``; directory and suffix are kept.

    * Files whose heading could not be parsed are reported and left untouched.
    * Files that already carry their target name are left untouched.
    * If the target name is taken by another file, " - 1" is appended to the
      stem until a free name is found.
    """
    required_keys = ("id1", "title", "id2")

    results = []
    for file in get_filelist():
        result_dict = parse_file(file)
        # parse_file returns only the raw text when the heading did not match;
        # such an entry has no ids/title and would make the sort below fail
        # with a KeyError, aborting the whole run.
        if not all(key in result_dict for key in required_keys):
            print(f"skipping '{file}': could not extract title and ids")
            continue
        result_dict["filename"] = file
        # result_dict["birth"] = file.stat().st_birthtime_ns
        result_dict["birth"] = file.stat().st_mtime
        results.append(result_dict)

    # We now have one dict per parsable file with the extracted title and ids.
    # Process them in a stable, predictable order.
    results.sort(key=itemgetter(*required_keys))

    for result in results:
        filename_old = result["filename"].resolve()
        stem_new = f"{result['id1']} - {result['title']} - {result['id2']}"
        # Swap only the stem; directory and ".pdf" suffix stay as they are.
        filename_new = filename_old.with_stem(stem_new)
        print(filename_new)
        # Already correctly named (e.g. the script is run a second time):
        # without this check the file would collide with itself and get a
        # needless " - 1" appended on every run.
        if filename_new == filename_old:
            continue
        # Extend the name if another file already uses it.
        while filename_new.exists():
            filename_new = filename_new.with_stem(filename_new.stem + " - 1")
        filename_old.rename(filename_new)

# Script entry point: start the batch rename.
# NOTE(review): this call is not guarded by `if __name__ == "__main__":`,
# so importing this file also runs the rename.
main()

Maybe it's of help for others…

Maybe just as a personal memo.