Skip to content

Instantly share code, notes, and snippets.

@jbaiter
Last active June 6, 2016 11:34
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jbaiter/6591110 to your computer and use it in GitHub Desktop.
Save jbaiter/6591110 to your computer and use it in GitHub Desktop.
Script to run ScanTailor on multiple CPU cores
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# The MIT License (MIT)
# Copyright (c) 2013 Johannes Baiter <johannes.baiter@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
from __future__ import division, unicode_literals, print_function
import itertools
import math
import multiprocessing
import os
import shutil
import sys
import subprocess
import tempfile
from xml.etree.cElementTree import ElementTree as ET
def find_in_path(name):
    """ Find executable in $PATH.

    Each directory listed in $PATH is probed for an executable regular file
    called *name*.  Entries that are empty, missing, or not directories
    simply do not match.

    :param name: name of the executable
    :type name: unicode
    :returns: bool -- True if *name* is found or False
    """
    # An unset $PATH is treated as an empty search path rather than crashing
    # on ``None.split``.
    search_path = os.environ.get('PATH') or ''
    # os.pathsep is ':' on POSIX and ';' on Windows.
    for directory in search_path.split(os.pathsep):
        if not directory:
            continue
        candidate = os.path.join(directory, name)
        # Probing the candidate directly avoids listing whole directories and
        # cannot fail on $PATH entries that are plain files or unreadable.
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            return True
    return False
# Fail fast when the module is loaded: everything below shells out to
# `scantailor-cli`, so there is no point continuing without it.
if not find_in_path('scantailor-cli'):
    # The trailing space keeps the two implicitly concatenated sentences
    # from running together in the message.
    raise Exception("Could not find executable `scantailor-cli` in $PATH. "
                    "Please install the appropriate package(s)!")
def split_configuration(projectfile, temp_dir):
    """ Split a project file into several smaller project files.

    The ``./files/file`` entries of *projectfile* are divided into at most
    ``multiprocessing.cpu_count()`` contiguous index ranges.  For every
    non-empty range the project is parsed again, the children outside that
    range are removed from the ``files``, ``images``, ``pages`` and
    ``file-name-disambiguation`` sections, and the result is written to
    ``<project name>-<index>.ScanTailor`` inside *temp_dir*.

    :param projectfile: path to the project file that should be split
    :type projectfile: unicode
    :param temp_dir: existing directory the partial project files go into
    :type temp_dir: unicode
    :returns: list -- paths of the partial project files, in index order;
              empty if the project lists no files
    """
    num_pieces = multiprocessing.cpu_count()
    num_files = len(ET(file=projectfile).findall('./files/file'))
    files_per_job = int(math.ceil(float(num_files)/num_pieces))
    project_name = os.path.splitext(os.path.basename(projectfile))[0]
    splitfiles = []
    for idx in range(num_pieces):
        start = idx*files_per_job
        if start >= num_files:
            # Fewer files than cores (or an uneven split): the remaining
            # pieces would be empty projects, so do not write them at all.
            break
        end = start + files_per_job
        if end > num_files:
            # Open-ended slice for the last piece.
            end = None
        # Parse a fresh copy for every piece, since nodes are removed from
        # the tree in place.
        tree = ET(file=projectfile)
        root = tree.getroot()
        # NOTE(review): the same index range is applied to all four
        # sections, which presumes their children line up one-to-one with
        # the file entries -- confirm against the project file format.
        for elem in ('files', 'images', 'pages',
                     'file-name-disambiguation'):
            elem_root = root.find(elem)
            if elem_root is None:
                # Section missing from this project file; nothing to trim.
                continue
            children = list(elem_root)
            outside = children[:start]
            if end is not None:
                outside += children[end:]
            for node in outside:
                elem_root.remove(node)
        out_file = os.path.join(
            temp_dir, "{0}-{1}.ScanTailor".format(project_name, idx))
        tree.write(out_file)
        splitfiles.append(out_file)
    return splitfiles
def generate_output(projectfile, out_dir):
    """ Run `scantailor-cli` on *projectfile*, one process per split file.

    The project is split with :func:`split_configuration` into a temporary
    directory, one ``scantailor-cli --start-filter=6`` process is launched
    per partial project file, and the call blocks until all of them have
    exited.  The temporary directory is removed afterwards, also when
    splitting or launching fails.

    :param projectfile: path to the project file
    :type projectfile: unicode
    :param out_dir: directory passed to `scantailor-cli` as output directory
    :type out_dir: unicode
    """
    print("Generating output...")
    temp_dir = tempfile.mkdtemp(prefix="stmulti.")
    processes = []
    try:
        split_config = split_configuration(projectfile, temp_dir)
        print("Launching those subprocesses!")
        for config in split_config:
            processes.append(subprocess.Popen(
                ['scantailor-cli', '--start-filter=6', config, out_dir]))
        # wait() blocks until the child exits; polling in a tight loop
        # would burn a whole core that the children need.
        exit_codes = [proc.wait() for proc in processes]
        failed = [code for code in exit_codes if code != 0]
        if failed:
            print("{0} of {1} scantailor-cli processes exited with an error."
                  .format(len(failed), len(processes)), file=sys.stderr)
    finally:
        # On an aborted run, stop children that are still alive before
        # their project files are deleted from under them.
        for proc in processes:
            if proc.poll() is None:
                proc.terminate()
                proc.wait()
        shutil.rmtree(temp_dir)
if __name__ == '__main__':
    # Two positional arguments are mandatory: the project file and the
    # output directory.  Anything beyond those is ignored.
    if len(sys.argv) < 3:
        print("Please specify a project file and an output directory!")
        sys.exit(1)
    # Resolve both paths against the current working directory.
    projectfile, out_dir = (os.path.abspath(arg) for arg in sys.argv[1:3])
    generate_output(projectfile, out_dir)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment