#! /usr/bin/env python3


import itertools
import re
import sqlite3

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.layout import LAParams

from pdf2data.pdf import (
        PageIterator,
        find_attr_group_matching, GroupNotFound,
        get_attr_lookup,
        find_lines_with,
        find_row_table, find_col_table,
        merge_overlapping_rows)
from pdf2data.data import set_up_table, row_to_db


def verify_that_fonts_are_known(lines):
    for line in lines:
        assert (
                "ArialMT" in line.fontname
                or "Arial-BoldMT" in line.fontname
                or line.fontname == "Arial"
                or line.fontname == "Arial Bold"
                ), line.fontname


# {{{ shared infrastructure

class EmptyDataPage(RuntimeError):
    pass


def get_fy_period(lines):
    fy_period_matches = find_lines_with(
            r"FY ([0-9]{2,4})\s+Period ([0-9]{1,2})", lines)

    if fy_period_matches:
        (_, fy_period_match), = fy_period_matches
        fy = int(fy_period_match.group(1))
        if fy < 100:
            # FIXME: 2k1 (!)
            fy = fy + 2000

        period = int(fy_period_match.group(2))

        return fy, period

    else:
        fy_matches = find_lines_with(
                r"Fiscal Year:\s+([0-9]{4})\s+Start", lines)
        (_, fy_match), = fy_matches
        print("    no period found")

        return int(fy_match.group(1)), None


def convert_number(s):
    return float(s.replace(",", ""))

# }}}


# {{{ transactions

TRANSACTIONS_DATE_RE = re.compile("^([0-9]{2})-([A-Z]{3})-([0-9]{4})$")
MONTHS = "JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC".split()
assert len(MONTHS) == 12


def convert_transactions_date(s):
    date_match = TRANSACTIONS_DATE_RE.match(s)
    assert date_match is not None

    import datetime
    date = datetime.date(
            day=int(date_match.group(1)),
            month=1+MONTHS.index(date_match.group(2)),
            year=int(date_match.group(3)))

    return date.isoformat()


def get_transactions_page_account(lines, db_cursor):
    try:
        metadata_top_y0 = find_attr_group_matching(
                ["Chart", "Level", "Fund Term Dt"],
                "y0", lines)
    except GroupNotFound:
        try:
            find_attr_group_matching(["Chart"], "y0", lines)
        except GroupNotFound:
            raise EmptyDataPage()

    metadata_bottom_y0 = find_attr_group_matching(
            ["Sponsor", "Budget End Date"],
            "y0", lines)

    metadata_lines = [
            l for l in lines if metadata_bottom_y0 <= l.y0 <= metadata_top_y0]

    metadata_xlookup = get_attr_lookup(metadata_lines, "x0")
    metadata_x0s = sorted(metadata_xlookup)
    cfop_lines = list(itertools.chain.from_iterable(
            metadata_xlookup[x0] for x0 in metadata_x0s[1:3]))
    cfop_headers = metadata_xlookup[metadata_x0s[0]]

    cfop_data, cfop_descr = find_col_table(cfop_headers, cfop_lines)
    cfop_data.update(
            {key+"_descr": value for key, value in cfop_descr.items()})

    return row_to_db(
            db_cursor, "account",
            {k: v.text for k, v in cfop_data.items()},
            upsert_unique_cols=(
                "chart", "organization",
                "fund", "program"))


def read_transactions(lines):
    trans_top_y0 = find_attr_group_matching(
            ["Account", "Description", "Date", "RC"], "y0", lines)

    trans_bottom_y0 = find_attr_group_matching(
            ["Printed: [0-9]{2}-[A-Z]+-[0-9]{4} [0-9]{2}"],
            "y0", lines)

    trans_headers = [l for l in lines if abs(l.y0 - trans_top_y0) < 4]
    trans_tbl_top_y0 = min(l.y0 for l in trans_headers)

    # handle wrapped, offset header line "Purchase\nOrder Code"
    trans_headers = [l for l in trans_headers if l.text != "Purchase"]

    trans_lines = [
            l for l in lines
            if trans_bottom_y0 < l.y0 < trans_tbl_top_y0]

    return find_row_table(trans_headers, trans_lines)


def is_bogus_transaction_row(row):
    # ignore odd misaligned lines with little information
    return (
            (len(row) == 1 and "Seq" in row)
            or
            # handle wrapped, offset header line "Purchase\nOrder Code"
            (len(row) == 1 and "Purchase" in row
                and row["Purchase"].text == "Order Code"))


def read_transactions_pages(page_it, db_cursor):
    try:
        account_id = get_transactions_page_account(page_it.lines, db_cursor)
    except EmptyDataPage:
        page_it.advance()
        return

    fy, period = get_fy_period(page_it.lines)

    trans_rows = []

    while True:
        verify_that_fonts_are_known(page_it.lines)
        page_trans_rows = read_transactions(page_it.lines)
        trans_rows.extend(page_trans_rows)

        end_of_transactions = False
        for row in page_trans_rows:
            if is_bogus_transaction_row(row):
                continue

            if (
                    "Bold" in row["Account"].fontname
                    and row["Account"].text == "Net Totals"):
                end_of_transactions = True

        page_it.advance()

        if end_of_transactions:
            break

    # {{{ add descriptions from subtotal lines

    new_trans_rows = []
    row_batch = []

    for row in trans_rows:
        if is_bogus_transaction_row(row):
            continue

        if "Description" not in row:
            # ignore (computed) total lines
            assert (
                    row["Account"].text.startswith("Total")
                    or row["Account"].text.startswith("Net Totals"))

            assert "Bold" in row["Account"].fontname
            continue

        if "Bold" in row["Account"].fontname:
            acct_nr = row["Account"].text
            for batch_row in row_batch:
                assert batch_row["Account"] == acct_nr
                batch_row["Account Descr"] = row["Description"].text

            new_trans_rows.extend(row_batch)
            del row_batch[:]

        else:
            row = {k: v.text for k, v in row.items()}
            row["in_account_id"] = account_id
            row["fy"] = fy
            row["period"] = period
            row_batch.append(row)

    assert not row_batch

    trans_rows = new_trans_rows
    del new_trans_rows

    # }}}

    for row in trans_rows:
        row_to_db(
                db_cursor, "trans", row,
                col_name_mapping={
                    "Encumbrances": "encumbrance",
                    "Document #": "document_nr",
                    "Document": "document_nr",
                    "Deposit #": "deposit_nr",
                    "Description": "descr",
                    "Order Code": "purchase_order_code",
                    },
                value_converters={
                    "Date": convert_transactions_date,
                    "Budget": convert_number,
                    "Actual": convert_number,
                    "Encumbrances": convert_number,
                    })

# }}}


# {{{ payroll

PAYROLL_DATE_RE = re.compile("^([0-9]{2})/([0-9]{2})/([0-9]{4})$")


def convert_payroll_date(s):
    date_match = PAYROLL_DATE_RE.match(s)
    assert date_match is not None

    import datetime
    date = datetime.date(
            day=int(date_match.group(2)),
            month=int(date_match.group(1)),
            year=int(date_match.group(3)))

    return date.isoformat()


def get_payroll_page_account(lines, db_cursor):
    try:
        metadata_top_y0 = find_attr_group_matching(
                ["Chart", "Principal Investigator", "Grant Code"],
                "y0", lines)
    except GroupNotFound:
        try:
            find_attr_group_matching(["Chart"], "y0", lines)
        except GroupNotFound:
            raise EmptyDataPage()

    metadata_bottom_y0 = find_attr_group_matching(
            ["Account", "Name", "UIN"],
            "y0", lines)

    metadata_lines = [
            l for l in lines
            if metadata_bottom_y0 + 5 <= l.y0 <= metadata_top_y0]

    metadata_xlookup = get_attr_lookup(metadata_lines, "x0")
    metadata_x0s = sorted(metadata_xlookup)
    cfop_lines = list(itertools.chain.from_iterable(
            metadata_xlookup[x0] for x0 in metadata_x0s[1:3]))
    cfop_headers = metadata_xlookup[metadata_x0s[0]]

    cfop_data, cfop_descr = find_col_table(cfop_headers, cfop_lines)
    cfop_data.update(
            {key+"_descr": value for key, value in cfop_descr.items()})

    return row_to_db(
            db_cursor, "account",
            {k: v.text for k, v in cfop_data.items()},
            upsert_unique_cols=(
                "chart", "organization",
                "fund", "program"))


def read_payroll_rows(lines):
    proll_top_y0 = find_attr_group_matching(
            ["Suff", "Cat", "Seq", "Rate"], "y0", lines)

    proll_lines = [l for l in lines if l.y0 < proll_top_y0]
    proll_header_y0 = find_attr_group_matching(
            ["Account", "UIN", "EC", "Posn"], "y0", lines)
    trans_headers = sorted(
            [l for l in lines if l.y0 == proll_header_y0],
            key=lambda l: l.x0)

    pp1, pp2, pp3 = [
            i
            for i, h in enumerate(trans_headers)
            if h.text == "PayPeriod"]

    trans_headers[pp1] = trans_headers[pp1].copy("Pay Period Code")
    trans_headers[pp2] = trans_headers[pp2].copy("Pay Period Begin")
    trans_headers[pp3] = trans_headers[pp3].copy("Pay Period End")

    pp1, pp2 = [
            i
            for i, h in enumerate(trans_headers)
            if h.text == "Posn"]

    trans_headers[pp1] = trans_headers[pp1].copy("Posn")
    trans_headers[pp2] = trans_headers[pp2].copy("Posn Suff")

    pp1, pp2 = [
            i
            for i, h in enumerate(trans_headers)
            if h.text == "Pay"]

    trans_headers[pp1] = trans_headers[pp1].copy("Pay Cat")
    trans_headers[pp2] = trans_headers[pp2].copy("Pay Seq")

    return find_row_table(trans_headers, proll_lines, heading_bias="min")


def read_payroll_pages(page_it, db_cursor):
    try:
        account_id = get_payroll_page_account(page_it.lines, db_cursor)
    except EmptyDataPage:
        page_it.advance()
        return

    fy, period = get_fy_period(page_it.lines)

    proll_rows = []

    while True:
        verify_that_fonts_are_known(page_it.lines)
        page_proll_rows = read_payroll_rows(page_it.lines)
        page_proll_rows = merge_overlapping_rows(
                page_proll_rows, "y0", "y1")

        proll_rows.extend(page_proll_rows)

        end_of_transactions = False
        for row in page_proll_rows:
            if (
                    "Bold" in row["Name"].fontname
                    and row["Name"].text == "Total Personnel Expense"):
                end_of_transactions = True

        page_it.advance()

        if end_of_transactions:
            break

    # {{{ add descriptions from subtotal lines

    new_proll_rows = []
    row_batch = []

    for row in proll_rows:
        if "Account" not in row:
            # ignore (computed) total lines
            assert row["Name"].text.startswith("Total")

            assert "Bold" in row["Name"].fontname
            continue

        if "Bold" in row["Account"].fontname:
            acct_nr = row["Account"].text
            for batch_row in row_batch:
                assert batch_row["Account"] == acct_nr
                batch_row["Account Descr"] = row["Name"].text

            new_proll_rows.extend(row_batch)
            del row_batch[:]

        else:
            row = {k: v.text for k, v in row.items()}
            row["in_account_id"] = account_id
            row["fy"] = fy
            row["period"] = period
            row_batch.append(row)

    assert not row_batch

    proll_rows = new_proll_rows
    del new_proll_rows

    # }}}

    for row in proll_rows:
        row_to_db(
                db_cursor, "payroll", row,
                col_name_mapping={
                    "Doc Num": "document_nr",
                    "Fringe": "fringe_rate",
                    },
                value_converters={
                    "Pay Period Begin": convert_payroll_date,
                    "Pay Period End": convert_payroll_date,
                    "Amount": convert_number,
                    "Hours": convert_number,
                    "FTE": convert_number,
                    "Fringe": convert_number,
                    })
# }}}


# {{{ top level

def read_statement(pdf_fileobj, db, *, start_page=None, end_page=None):
    set_up_table(
            db, "account", (
                ("chart", "text"),
                ("chart_descr", "text"),
                ("organization", "text"),
                ("organization_descr", "text"),
                ("fund", "text"),
                ("fund_descr", "text"),
                ("program", "text"),
                ("program_descr", "text"),
                ("fund_type", "text"),
                ("fund_type_descr", "text"),
                ("sponsor", "text"),
                ("sponsor_descr", "text"),
                ))

    set_up_table(
            db, "trans", (
                ("fy", "num"),
                ("period", "num"),
                ("in_account_id", "num"),

                ("account", "text"),
                ("account_descr", "text"),
                ("descr", "text"),
                ("date", "text"),
                ("rc", "text"),
                ("document_nr", "text"),
                ("seq", "num"),
                ("purchase_order_code", "text"),
                ("doc_ref", "text"),
                ("deposit_nr", "num"),
                ("budget", "num"),
                ("actual", "num"),
                ("encumbrance", "num"),
                ("activity", "text"),
                ("location", "text"),
            ))

    set_up_table(
            db, "payroll", (
                ("fy", "num"),
                ("period", "num"),
                ("in_account_id", "num"),

                ("account", "text"),
                ("account_descr", "text"),
                ("name", "text"),
                ("uin", "text"),
                ("ec", "text"),
                ("posn", "text"),
                ("posn_suff", "text"),
                ("pay_cat", "text"),
                ("pay_seq", "text"),
                ("fte", "num"),
                ("pay_period_code", "text"),
                ("pay_period_begin", "text"),
                ("pay_period_end", "text"),
                ("hours", "num"),
                ("fringe_rate", "num"),
                ("amount", "num"),
                ("document_nr", "text"),
                ("activity", "text"),
                ("location", "text"),
                ))

    document = PDFDocument(PDFParser(pdf_fileobj), None)

    page_it = PageIterator(document, LAParams(char_margin=0.2))

    if start_page is not None:
        while page_it.i_page < start_page:
            page_it.advance()

    while not page_it.is_at_end:
        if end_page is not None and page_it.i_page >= end_page:
            break

        print(f"page {page_it.i_page}")

        if (
                find_lines_with(
                    "FIOL Revenue/Expense/Transfer Transactions",
                    page_it.lines)
                or
                find_lines_with(
                    "FIGL_Operating_Ledger_Transaction_Statement",
                    page_it.lines)
                ):

            read_transactions_pages(page_it, db.cursor())
        elif (
                find_lines_with(
                    "HPAY Payroll Expense by Person Month End", page_it.lines)
                or
                find_lines_with(
                    "FIPR_Payroll_Labor_Distribution_Month_End", page_it.lines)
                or
                find_lines_with(
                    "FIPR_Payroll_Expense_by_Person_Month_End", page_it.lines)
                ):
            read_payroll_pages(page_it, db.cursor())
        else:
            raise ValueError(f"unexpected page type: {page_it.i_page}")


def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("statement_pdf", metavar="FILENAME.PDF")
    parser.add_argument(
            "--database", metavar="FILENAME.SQLITE3",
            default="statements.sqlite3")
    parser.add_argument(
            "--start-page", metavar="PAGENO", type=int)
    parser.add_argument(
            "--end-page", metavar="PAGENO", type=int)
    args = parser.parse_args()

    with open(args.statement_pdf, 'rb') as pdf_fileobj:
        with sqlite3.connect(args.database) as db:
            read_statement(
                    pdf_fileobj, db,
                    start_page=args.start_page, end_page=args.end_page)

# }}}


if __name__ == "__main__":
    main()

# vim: foldmethod=marker
