Skip to content

Commit

Permalink
Merge pull request #229 from VikParuchuri/dev
Browse files (browse the repository at this point in the history)
Significant speedup
  • Loading branch information
VikParuchuri committed Jul 12, 2024
2 parents 0f792b5 + d427c4d commit 054fac1
Show file tree
Hide file tree
Showing 5 changed files with 450 additions and 427 deletions.
6 changes: 6 additions & 0 deletions convert_single.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import time

import pypdfium2 # Needs to be at the top to avoid warnings
import os
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # For some reason, transformers decided to use .isin for a simple op, which is not supported on MPS
Expand All @@ -20,18 +22,22 @@ def main():
parser.add_argument("--start_page", type=int, default=None, help="Page to start processing at")
parser.add_argument("--langs", type=str, help="Languages to use for OCR, comma separated", default=None)
parser.add_argument("--batch_multiplier", type=int, default=2, help="How much to increase batch sizes")
parser.add_argument("--debug", action="store_true", help="Enable debug logging", default=False)
args = parser.parse_args()

langs = args.langs.split(",") if args.langs else None

fname = args.filename
model_lst = load_all_models()
start = time.time()
full_text, images, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, langs=langs, batch_multiplier=args.batch_multiplier, start_page=args.start_page)

fname = os.path.basename(fname)
subfolder_path = save_markdown(args.output, fname, full_text, images, out_meta)

print(f"Saved markdown to the {subfolder_path} folder")
if args.debug:
print(f"Total time: {time.time() - start}")


if __name__ == "__main__":
Expand Down
14 changes: 7 additions & 7 deletions marker/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@


from marker.postprocessors.editor import load_editing_model
from surya.model.detection import segformer
from surya.model.detection.model import load_model as load_detection_model, load_processor as load_detection_processor
from texify.model.model import load_model as load_texify_model
from texify.model.processor import load_processor as load_texify_processor
from marker.settings import settings
Expand All @@ -25,11 +25,11 @@ def setup_recognition_model(langs, device=None, dtype=None):

def setup_detection_model(device=None, dtype=None):
if device:
model = segformer.load_model(device=device, dtype=dtype)
model = load_detection_model(device=device, dtype=dtype)
else:
model = segformer.load_model()
model = load_detection_model()

processor = segformer.load_processor()
processor = load_detection_processor()
model.processor = processor
return model

Expand All @@ -46,10 +46,10 @@ def setup_texify_model(device=None, dtype=None):

def setup_layout_model(device=None, dtype=None):
if device:
model = segformer.load_model(checkpoint=settings.LAYOUT_MODEL_CHECKPOINT, device=device, dtype=dtype)
model = load_detection_model(checkpoint=settings.LAYOUT_MODEL_CHECKPOINT, device=device, dtype=dtype)
else:
model = segformer.load_model(checkpoint=settings.LAYOUT_MODEL_CHECKPOINT)
processor = segformer.load_processor(checkpoint=settings.LAYOUT_MODEL_CHECKPOINT)
model = load_detection_model(checkpoint=settings.LAYOUT_MODEL_CHECKPOINT)
processor = load_detection_processor(checkpoint=settings.LAYOUT_MODEL_CHECKPOINT)
model.processor = processor
return model

Expand Down
6 changes: 1 addition & 5 deletions marker/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def TORCH_DEVICE_MODEL(self) -> str:
# Layout model
SURYA_LAYOUT_DPI: int = 96
BAD_SPAN_TYPES: List[str] = ["Caption", "Footnote", "Page-footer", "Page-header", "Picture"]
LAYOUT_MODEL_CHECKPOINT: str = "vikp/surya_layout2"
LAYOUT_MODEL_CHECKPOINT: str = "vikp/surya_layout3"
BBOX_INTERSECTION_THRESH: float = 0.7 # How much the layout and pdf bboxes need to overlap to be the same
LAYOUT_BATCH_SIZE: Optional[int] = None # Defaults to 12 for cuda, 6 otherwise

Expand All @@ -83,10 +83,6 @@ def TORCH_DEVICE_MODEL(self) -> str:
ENABLE_EDITOR_MODEL: bool = False # The editor model can create false positives
EDITOR_CUTOFF_THRESH: float = 0.9 # Ignore predictions below this probability

# Ray
RAY_CACHE_PATH: Optional[str] = None # Where to save ray cache
RAY_CORES_PER_WORKER: int = 1 # How many cpu cores to allocate per worker

# Debug
DEBUG: bool = False # Enable debug logging
DEBUG_DATA_FOLDER: Optional[str] = None
Expand Down
Loading

0 comments on commit 054fac1

Please sign in to comment.