
| Current Path : /usr/local/bin/ |
Linux ift1.ift-informatik.de 5.4.0-216-generic #236-Ubuntu SMP Fri Apr 11 19:53:21 UTC 2025 x86_64 |
| Current File : //usr/local/bin/pdftotree |
#!/usr/bin/python3
"""Simple commandline interface for parsing PDF to hOCR."""
import argparse
import logging
import os
import pdftotree
def _build_parser():
    """Build the argument parser for the PDF-to-hOCR command-line tool.

    Returns:
        argparse.ArgumentParser: parser exposing the ``pdf_file`` positional
        argument and the model, output, visualization and verbosity options.
    """
    parser = argparse.ArgumentParser(
        description="""
        Convert PDF into hOCR.
        """,
        usage="%(prog)s [options] pdf_file",
    )
    parser.add_argument(
        "-mt",
        "--model_type",
        type=str,
        default=None,
        # ``None`` is intentionally not listed as a choice: with ``type=str``
        # it could never be matched from the command line. Omitting the option
        # keeps the default of ``None``.
        choices=["vision", "ml"],
        help="Model type to use. None (default) for heuristics approach.",
    )
    parser.add_argument(
        "-m",
        "--model_path",
        type=str,
        default=None,
        help="Pretrained model, generated by extract_tables tool",
    )
    parser.add_argument(
        "pdf_file",
        type=str,
        help="Path to input PDF file.",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=str,
        help="Path to output hOCR file. If not given, it will be printed to stdout.",
    )
    parser.add_argument(
        "-V",
        "--visualize",
        dest="visualize",
        action="store_true",
        help="Whether to output visualization images for the tree",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        dest="verbose",
        action="store_true",
        help="Output INFO level logging.",
    )
    parser.add_argument(
        "-vv",
        "--veryverbose",
        dest="debug",
        action="store_true",
        help="Output DEBUG level logging.",
    )
    parser.set_defaults(visualize=False)
    return parser


def _configure_logging(log_level):
    """Attach a stream handler at ``log_level`` to the "pdftotree" logger.

    Args:
        log_level: a ``logging`` level constant applied to both the logger
            and its handler.
    """
    log = logging.getLogger("pdftotree")
    log.propagate = False  # prevent propagation to the root logger
    handler = logging.StreamHandler()
    log.setLevel(log_level)
    handler.setLevel(log_level)
    handler.setFormatter(
        logging.Formatter("[%(levelname)s] %(name)s - %(message)s")
    )
    log.addHandler(handler)


def main(argv=None):
    """Parse the command line, configure logging and run ``pdftotree.parse``.

    Args:
        argv: list of argument strings to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    Raises:
        SystemExit: raised by argparse on ``--help``, on invalid arguments,
            when only one of ``--model_type`` / ``--model_path`` is given, or
            when the given model path does not exist.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    # The most verbose flag wins; without any flag only errors are shown.
    if args.debug:
        log_level = logging.DEBUG
    elif args.verbose:
        log_level = logging.INFO
    else:
        log_level = logging.ERROR

    # The model type and the model path only make sense as a pair.
    if bool(args.model_type) != bool(args.model_path):
        parser.error("Both a model_type and a model_path must be provided together.")
    elif args.model_type and not os.path.exists(args.model_path):
        parser.error("A valid path to a pretrained model must be provided.")

    # Configure logging for this application
    _configure_logging(log_level)

    # Call the main routine
    result = pdftotree.parse(
        args.pdf_file,
        args.output,
        args.model_type,
        args.model_path,
        args.visualize,
    )

    # Without an output path the returned result is printed to stdout.
    if args.output is None:
        print(result)
    else:
        print("hOCR output to {}".format(args.output))


if __name__ == "__main__":
    main()