From 97c833186602b6815a4c0a64b392cc7dae141bb9 Mon Sep 17 00:00:00 2001 From: devaine Date: Thu, 29 Jan 2026 15:15:27 -0600 Subject: [PATCH] feat(diff): adding file management and diff to extract files --- .gitignore | 1 + webscrape-bank/run.sh | 32 ++++++++++++++++++++++ webscrape-bank/src/extract.py | 10 +++---- webscrape-bank/src/main.py | 50 +++++++++++++++++++++++++++++++++++ 4 files changed, 86 insertions(+), 7 deletions(-) create mode 100755 webscrape-bank/run.sh diff --git a/.gitignore b/.gitignore index d71c11c..c3d38e3 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ notes # Others node_modules/ +**/__pycache__/ diff --git a/webscrape-bank/run.sh b/webscrape-bank/run.sh new file mode 100755 index 0000000..31e8b3e --- /dev/null +++ b/webscrape-bank/run.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +PIP_PACKAGES=("python-dotenv" "pytest-playwright") + +if [ "$1" == "del" ]; then + rm -rf .venv + echo ".venv/ removed, restart the program again" +fi + +if [ -d .venv ]; then + echo ".venv/ found!" + + source .venv/bin/activate + + python src/main.py + +else + echo ".venv/ not found!" 
+ echo "Creating new venv/" + + python -m venv .venv + source .venv/bin/activate + + for i in "${PIP_PACKAGES[@]}" + do + pip install "$i" + done + + playwright install + + python src/main.py +fi diff --git a/webscrape-bank/src/extract.py b/webscrape-bank/src/extract.py index be7c735..3ab746f 100644 --- a/webscrape-bank/src/extract.py +++ b/webscrape-bank/src/extract.py @@ -2,6 +2,7 @@ from playwright.sync_api import Playwright, sync_playwright, Page from dotenv import load_dotenv import os from time import sleep +from datetime import datetime load_dotenv() @@ -76,23 +77,18 @@ def download_file(page: Page): with page.expect_download(timeout=0) as download_info: page.mouse.wheel(0, 70) - page.locator( 'iframe[title="NextGen account history page"]' ).content_frame.get_by_role("menuitem", name="Export QFX").dblclick() - download = download_info.value - download.save_as("./test.qfx") - + # Download + download.save_as("./qfx/download-" + str(datetime.now().isoformat()) + ".qfx") print("Downloaded!") switchingForever(page) - # context.close() - # browser.close() - if __name__ == "__main__": with sync_playwright() as playwright: diff --git a/webscrape-bank/src/main.py b/webscrape-bank/src/main.py index e69de29..3b53bd3 100644 --- a/webscrape-bank/src/main.py +++ b/webscrape-bank/src/main.py @@ -0,0 +1,50 @@ +from playwright.sync_api import sync_playwright +from threading import Thread +import os +import extract +#from datetime import datetime +import difflib + +# Playwright in the background +def playwright(): + with sync_playwright() as playwright: + extract.main(playwright) + + +def revise(): + # Removes the OLDEST file on the list () + file_count = 0 + for file in os.scandir("./qfx"): + print(file_count) + if file_count > 1: + #print(os.listdir("./qfx")) + print("Removed: " + sorted(os.listdir("./qfx"))[0]) + os.remove("./qfx/" + sorted(os.listdir("./qfx"))[0]) + + + if file.is_file(): + file_count += 1 + + # Get the last two files + oldest_file = open("./qfx/" + sorted(os.listdir("./qfx"))[0]) 
+ newest_file = open("./qfx/" + sorted(os.listdir("./qfx"))[-1]) + + # Diffs the two files + diff = difflib.ndiff(oldest_file.readlines(), newest_file.readlines()) + + # Grabs only changes + # Thanks to: https://stackoverflow.com/a/15864920 + changes = [l for l in diff if l.startswith("+ ") or l.startswith('- ')] + print("RESULT:") + for change in changes: + print(change[2:]) + +def main(): + revise_thread = Thread(target=revise) + #pw_thread = Thread(target=playwright) + #pw_thread.start() + revise_thread.start() + + +if __name__ == "__main__": + main()