# NOTE(review): presumably the PyScript package list (py-config) that was
# flattened into this script; as plain Python it is an unused assignment.
packages = ["pandas", "setuptools", "pdfreader-0.1.13.dev0-py3-none-any.whl"]

from pdfreader import SimplePDFViewer, PDFDocument
import pandas as pd
from pyodide.ffi.wrappers import add_event_listener
from pyodide.ffi import to_js
from js import document, window, Blob, URL

# Most recently parsed statement; stays None until a file has been processed.
df = None

# A page's transaction listing starts after the first string containing one
# of these markers ...
_HEADER_KEYWORDS = ("NEW TRANSACTIONS", "AMOUNT(S$)")
# ... and ends at the last string containing one of these.
_FOOTER_KEYWORDS = ("Total :", "CR", "DB")
_COLUMNS = ["Date", "Merchant", "Ref", "Amount", "CR/DB"]
_FIELDS_PER_ROW = len(_COLUMNS)


async def get_bytes_from_file(file):
    """Read a JS file object fully and return its content as Python bytes."""
    array_buf = await file.arrayBuffer()
    return array_buf.to_bytes()


def _transaction_strings(data):
    """Return the slice of one page's strings that holds transaction fields.

    ``data`` is the list of strings rendered for a single page. The slice
    begins just after the header marker (and after a following
    "PayLah! Wallet No." string, if present) and ends at the last CR/DB
    flag, or at a "Total :" marker found four strings before it.
    """
    length = len(data)
    if length == 0:
        # Nothing rendered on this page; indexing data[0] would raise.
        return []

    start, end = 0, length - 1

    # Scan forward to the header marker (or give up at the last string).
    while start < end and not any(kw in data[start] for kw in _HEADER_KEYWORDS):
        start += 1
    if start + 1 < length and "PayLah! Wallet No." in data[start + 1]:
        start += 1
    start += 1  # step past the marker itself

    # Scan backward to the footer marker unless the page already ends on a flag.
    if data[end] not in ("CR", "DB"):
        while start < end and not any(kw in data[end] for kw in _FOOTER_KEYWORDS):
            end -= 1

    # NOTE(review): the original file lost its indentation, so whether this
    # check was nested under the `if` above cannot be recovered; it is applied
    # unconditionally here. The `end >= 4` guard keeps `end - 4` from wrapping
    # around to the tail of the list.
    if end >= 4 and "Total :" in data[end - 4]:
        end -= 4
    end += 1  # make the bound exclusive

    return data[start:end]


def _signed_amount(amount, flag):
    """Parse an amount string; "CR" rows stay positive, all others are negated."""
    # Strip thousands separators so values such as "1,234.56" parse.
    value = float(amount.replace(",", ""))
    return value if flag == "CR" else -value


async def process_file(e):
    """Handle the file input's change event: parse the PDF into ``df``.

    Reads the first selected file, collects the transaction strings of every
    page, groups them five at a time into rows, and stores a
    Date / Merchant / Amount DataFrame in the module-level ``df``. ``df`` is
    only replaced once parsing has succeeded.
    """
    global df

    first_item = e.target.files.item(0)
    if first_item is None:
        # The selection was cleared or the dialog cancelled; nothing to parse.
        return

    fd: bytes = await get_bytes_from_file(first_item)
    viewer = SimplePDFViewer(fd)
    doc = PDFDocument(fd)
    pages = sum(1 for _ in doc.pages())

    extracted = []
    for page_number in range(1, pages + 1):
        viewer.navigate(page_number)
        viewer.render()
        extracted += _transaction_strings(viewer.canvas.strings)

    rows = [
        extracted[i : i + _FIELDS_PER_ROW]
        for i in range(0, len(extracted), _FIELDS_PER_ROW)
    ]
    if rows:
        # The final group is always discarded, as in the original script;
        # presumably it is the trailing "Total :" fragment — confirm.
        rows.pop()

    frame = pd.DataFrame(rows, columns=_COLUMNS)
    frame["Amount"] = [
        _signed_amount(amount, flag)
        for amount, flag in zip(frame["Amount"], frame["CR/DB"])
    ]
    df = frame[["Date", "Merchant", "Amount"]]


def download_file(*args):
    """Handle the download button's click event: offer ``df`` as data.csv.

    Does nothing when no statement has been parsed yet.
    """
    global df
    if df is None:
        return

    csv_bytes = df.to_csv(index=False).encode('utf-8')
    blob = Blob.new([to_js(csv_bytes)], {"type": "text/csv"})
    # NOTE(review): this object URL is never revoked, so every click keeps
    # another blob alive; consider URL.revokeObjectURL once the download has
    # started.
    url = URL.createObjectURL(blob)

    # Trigger the download through a detached anchor element.
    hidden_link = document.createElement("a")
    hidden_link.setAttribute("download", "data.csv")
    hidden_link.setAttribute("href", url)
    hidden_link.click()


add_event_listener(document.getElementById("file-upload"), "change", process_file)
add_event_listener(document.getElementById("download"), "click", download_file)