
| Current Path : /proc/self/root/usr/local/bin/ |
Linux ift1.ift-informatik.de 5.4.0-216-generic #236-Ubuntu SMP Fri Apr 11 19:53:21 UTC 2025 x86_64 |
| Current File : //proc/self/root/usr/local/bin/extract_tables |
#!/usr/bin/python3
import argparse
import logging
import os
import pickle
import sys
from builtins import range, str
import numpy as np
from sklearn import linear_model, metrics, preprocessing
from pdftotree.ml.TableExtractML import TableExtractorML
from pdftotree.utils.bbox_utils import compute_iou, doOverlap, isContained
def get_logger(name):
log = logging.getLogger(name)
if not log.handlers:
# Prevent logging from propagating to the root logger
log.propagate = 0
console = logging.StreamHandler()
log.addHandler(console)
formatter = logging.Formatter("[%(levelname)s] %(name)s - %(message)s")
console.setFormatter(formatter)
# Also setup pdftotree logs
pdftotree_log = logging.getLogger("pdftotree")
pdftotree_log.addHandler(console)
return log
def get_bboxes_from_line(line):
if line == "NO_TABLES":
return {}
bboxes = {}
for bbox in line.split(";"):
page_num, page_width, page_height, y0, x0, y1, x1 = bbox[1:-1].split(",")
try:
bboxes[int(page_num)] += [
(
float(page_width),
float(page_height),
float(y0),
float(x0),
float(y1),
float(x1),
)
]
except KeyError:
bboxes[int(page_num)] = [
(
float(page_width),
float(page_height),
float(y0),
float(x0),
float(y1),
float(x1),
)
]
return bboxes
def get_features_and_labels(pdf_list, gt_list, datapath):
pdf_files = [pdf_file.rstrip() for pdf_file in open(pdf_list).readlines()]
gt = [gt.rstrip() for gt in open(gt_list).readlines()]
tables = []
for i, pdf_file in enumerate(pdf_files):
log.info("Processing {} ({} of {})".format(pdf_file, i + 1, len(pdf_files)))
gt_tables = get_bboxes_from_line(gt[i])
extractor = TableExtractorML(datapath + pdf_file)
bboxes, features, is_scanned = extractor.get_candidates_and_features()
if is_scanned:
log.exception(
"{} is a scanned document, which is not supported.".format(pdf_file)
)
raise ValueError(
"Documents requiring OCR are not supported.", datapath, pdf_file
)
labels = extractor.get_labels(gt_tables)
bboxes = [[i] + list(bbox) for bbox in bboxes]
if i == 0:
X = np.array(features)
y = np.array(labels)
tables = np.array(bboxes)
else:
X = np.concatenate((X, np.array(features)), axis=0)
y = np.concatenate((y, labels), axis=0)
tables = np.concatenate((tables, bboxes), axis=0)
X = preprocessing.scale(X, axis=0)
log.info("Features computed!")
return X, y, tables
def load_train_data(pdf_list, gt_list, datapath):
if os.path.exists(pdf_list + ".features.pkl"):
log.info("Loading precomputed features for {}...".format(pdf_list))
# load pickled data
X = pickle.load(open(pdf_list + ".features.pkl", "rb"))
y = pickle.load(open(pdf_list + ".labels.pkl", "rb"))
tables = pickle.load(open(pdf_list + ".candidates.pkl", "rb"))
log.info("Features loaded!")
else:
log.info("Building feature matrix for {}".format(pdf_list))
# compute and pickle feature matrix
X, y, tables = get_features_and_labels(pdf_list, gt_list, datapath)
pickle.dump(X, open(pdf_list + ".features.pkl", "wb"))
pickle.dump(y, open(pdf_list + ".labels.pkl", "wb"))
pickle.dump(tables, open(pdf_list + ".candidates.pkl", "wb"))
return X, y, tables.astype(np.int)
def get_features(pdf_list, datapath):
pdf_files = [pdf_file.rstrip() for pdf_file in open(pdf_list).readlines()]
X = []
tables = []
scanned_indices = []
scanned_tables = {}
for i, pdf_file in enumerate(pdf_files):
log.info("Processing {} ({} of {})".format(pdf_file, i + 1, len(pdf_files)))
extractor = TableExtractorML(datapath + pdf_file)
bboxes, features, is_scanned = extractor.get_candidates_and_features()
if is_scanned:
log.exception(
"{} is a scanned document, which is not supported.".format(pdf_file)
)
raise ValueError(
"Documents requiring OCR are not supported.", datapath, pdf_file
)
bboxes = [[i] + list(bbox) for bbox in bboxes]
if len(X) == 0:
X = np.array(features)
tables = np.array(bboxes)
else:
X = np.concatenate((X, np.array(features)), axis=0)
tables = np.concatenate((tables, bboxes), axis=0)
if len(X) > 0:
X = preprocessing.scale(X, axis=0)
log.info("Features computed!")
return X, tables, scanned_indices, scanned_tables
def load_test_data(pdf_list, datapath):
if os.path.exists(pdf_list + ".features.pkl"):
log.info("Loading precomputed features for {}...".format(pdf_list))
# load pickled data
X = pickle.load(open(pdf_list + ".features.pkl", "rb"))
tables = pickle.load(open(pdf_list + ".candidates.pkl", "rb"))
scanned_indices = pickle.load(open(pdf_list + ".scanned_indices.pkl", "rb"))
scanned_tables = pickle.load(open(pdf_list + ".scanned_tables.pkl", "rb"))
log.info("Features loaded!")
else:
log.info("Building feature matrix for {}".format(pdf_list))
# compute and pickle feature matrix
X, tables, scanned_indices, scanned_tables = get_features(pdf_list, datapath)
pickle.dump(X, open(pdf_list + ".features.pkl", "wb"))
pickle.dump(tables, open(pdf_list + ".candidates.pkl", "wb"))
pickle.dump(scanned_indices, open(pdf_list + ".scanned_indices.pkl", "wb"))
pickle.dump(scanned_tables, open(pdf_list + ".scanned_tables.pkl", "wb"))
return X, tables.astype(np.int), scanned_indices, scanned_tables
def compute_overlap_matrix(pdf_bboxes, iou_thresh):
nb_tables = len(pdf_bboxes)
overlap = np.zeros((nb_tables, nb_tables))
for i, bb1 in enumerate(pdf_bboxes):
for j, bb2 in enumerate(pdf_bboxes):
if i != j and bb1[0] == bb2[0] and doOverlap(bb1[-4:], bb2[-4:]):
iou = compute_iou(bb1[-4:], bb2[-4:])
if (
iou > iou_thresh
or isContained(bb1[-4:], bb2[-4:])
or isContained(bb2[-4:], bb1[-4:])
):
overlap[i, j] = 1.
return overlap
def filter_overlapping_bboxes(overlap_bboxes):
areas = []
for bbox in overlap_bboxes:
top, left, bottom, right = bbox[-4:]
areas.append((bottom - top) * (right - left))
bbox = overlap_bboxes[np.argmax(areas)]
return bbox
def remove_duplicates(pdf_bboxes, iou_thresh):
filtered_bboxes = []
overlap = compute_overlap_matrix(pdf_bboxes, iou_thresh)
bboxes_idx = np.arange(len(pdf_bboxes))
for i in bboxes_idx:
if i == -1:
# ignore this bbox
pass
elif np.sum(overlap[i]) == 0:
# add this bbox since it does not overlap with any other bboxes
filtered_bboxes.append(tuple(pdf_bboxes[i]))
else:
# we take the bbox with maximum area among overlapping bounding boxes
overlap_idx = np.concatenate(([i], np.flatnonzero(overlap[i])))
overlap_bboxes = pdf_bboxes[overlap_idx]
bbox = filter_overlapping_bboxes(overlap_bboxes)
filtered_bboxes.append(tuple(bbox))
bboxes_idx[overlap_idx] = -1
return filtered_bboxes
def compute_stats(y_pred, y_test):
recall = metrics.recall_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
accuracy = metrics.accuracy_score(y_test, y_pred)
log.info("Classification Metrics:")
log.info(
"(Note that these statistics are not for the table detection task but for the classification problem."
)
log.info(
"To run evaluation for the table detection class, refer to the script char_level_evaluation.py)"
)
log.info("Precision: ", precision)
log.info("Recall: ", recall)
log.info("F1-score: ", 2 * precision * recall / (precision + recall))
log.info("Accuracy:", accuracy)
def train_model(X_train, y_train, model_path):
log.info("Training model...")
logistic = linear_model.LogisticRegression()
logistic.fit(X_train, y_train)
log.info("Model trained!")
pickle.dump(logistic, open(model_path, "wb"))
log.info("Model saved!")
return logistic
def load_model(model_path):
log.info("Loading pretrained model...")
model = pickle.load(open(model_path, "rb"))
log.info("Model loaded!")
return model
def filter_bboxes(tables, iou_thresh):
pdf_idx_to_filtered_bboxes = {}
pdf_idx = tables[0][0]
bboxes = []
for table in tables:
if table[0] == pdf_idx:
bboxes.append(tuple(table[1:]))
else:
filtered_bboxes = remove_duplicates(np.array(bboxes), iou_thresh)
pdf_idx_to_filtered_bboxes[pdf_idx] = filtered_bboxes
pdf_idx = table[0]
bboxes = [tuple(table[1:])]
return pdf_idx_to_filtered_bboxes
def bbox_to_dict(tables):
bbox_dict = {}
for table in tables:
try:
bbox_dict[table[0]] += [tuple(table[1:])]
except KeyError:
bbox_dict[table[0]] = [tuple(table[1:])]
return bbox_dict
def write_bbox_to_file(
bbox_file, pdf_idx_to_filtered_bboxes, num_test, scanned_test, scanned_tables
):
for i in range(num_test):
if i in scanned_test:
bbox_file.write(scanned_tables[i])
continue
try:
filtered_bboxes = pdf_idx_to_filtered_bboxes[i]
bbox_file.write(";".join([str(bbox) for bbox in filtered_bboxes]) + "\n")
except KeyError:
bbox_file.write("NO_TABLES\n")
bbox_file.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="""
Script to extract tables bounding boxes from PDF files using machine
learning. If `model.pkl` is saved in the model-path, the pickled
model will be used for prediction. Otherwise the model will be
retrained. If --mode is test (by default), the script will create a
.bbox file containing the tables for the pdf documents listed in the
file --test-pdf. If --mode is dev, the script will also extract ground
truth labels for the test data and compute statistics.
"""
)
parser.add_argument(
"--mode",
type=str,
default="test",
help="Usage mode dev or test, default is test",
)
parser.add_argument(
"--model-path",
type=str,
required=True,
help="Path to the model. If the file exists, it will be used. Otherwise, a new model will be trained.",
)
parser.add_argument(
"--train-pdf",
type=str,
help="List of pdf file names used for training. These files must be saved in the --datapath directory. Required if no pretrained model is provided.",
)
parser.add_argument(
"--test-pdf",
type=str,
required=True,
help="List of pdf file names used for testing. These files must be saved in the --datapath directory.",
)
parser.add_argument(
"--gt-train",
type=str,
help="Ground truth train tables. Required if no pretrained model is provided.",
)
parser.add_argument(
"--gt-test", type=str, required=True, help="Ground truth test tables."
)
parser.add_argument(
"--datapath",
type=str,
required=True,
help="Path to directory containing the input documents.",
)
parser.add_argument(
"--iou-thresh",
type=float,
default=0.8,
help="Intersection over union threshold to remove duplicate tables",
)
parser.add_argument(
"-v", dest="verbose", action="store_true", help="Output INFO level logging"
)
parser.add_argument(
"-vv", dest="debug", action="store_true", help="Output DEBUG level logging"
)
args = parser.parse_args()
if args.debug:
log_level = logging.DEBUG
elif args.verbose:
log_level = logging.INFO
else:
log_level = logging.WARNING
log = get_logger(__name__)
log.setLevel(log_level)
# load or train the model
if args.model_path and os.path.exists(args.model_path):
model = load_model(args.model_path)
else:
if not args.train_pdf or not args.gt_train:
parser.error(
"--train-pdf and --gt-train must be provided if there is no pretrained model."
)
X_train, y_train, tables_train = load_train_data(
args.train_pdf, args.gt_train, args.datapath
)
model = train_model(X_train, y_train, args.model_path)
num_test = len(open(args.test_pdf).readlines())
if args.mode == "dev":
# load dev data (with ground truth, for evaluation)
X_test, y_test, tables_test = load_train_data(
args.test_pdf, args.gt_test, args.datapath
)
# predict tables for dev tables and evaluate
y_pred = model.predict(X_test)
log.info("Testing for {} pdf documents".format(num_test))
compute_stats(y_pred, y_test)
elif args.mode == "test":
# load test data (with no ground truth)
X_test, tables_test, scanned_test, scanned_tables = load_test_data(
args.test_pdf, args.datapath
)
if len(X_test) == 0: # all docs are scanned
y_pred = []
else:
y_pred = model.predict(X_test)
else:
log.error("Mode not recognized, pick dev or test.")
sys.exit()
if len(y_pred) != 0:
predicted_tables = tables_test[np.flatnonzero(y_pred)]
# todo: remove duplicate tables
# pdf_idx_to_filtered_bboxes = filter_bboxes(predicted_tables, args.iou_thresh)
pdf_idx_to_filtered_bboxes = bbox_to_dict(predicted_tables)
# write tables to file
else:
pdf_idx_to_filtered_bboxes = []
bbox_file = open(args.test_pdf + ".bbox", "w")
write_bbox_to_file(
bbox_file, pdf_idx_to_filtered_bboxes, num_test, scanned_test, scanned_tables
)