% Thallo: scheduling DSL for large-scale non-linear least-squares GPU solvers
% (ACM Transactions on Graphics, vol. 40, no. 5, article 184, 2021).
% Conventions used below: accented letters are written as BibTeX "special
% characters" (brace group starting with the accent macro) so that sorting and
% label generation treat them as single letters; month is the bare three-letter
% macro so the bibliography style can expand/localise it; the journal is stored
% unabbreviated so the style decides on abbreviation.
@article{mara2021thallo,
    author = {Mara, Michael and Heide, Felix and Zollh{\"o}fer, Michael and Nie{\ss}ner, Matthias and Hanrahan, Pat},
    title = {{Thallo} -- Scheduling for High-Performance Large-Scale Non-Linear Least-Squares Solvers},
    year = {2021},
    issue_date = {October 2021},
    publisher = {Association for Computing Machinery},
    address = {New York, NY, USA},
    volume = {40},
    number = {5},
    issn = {0730-0301},
    url = {https://doi.org/10.1145/3453986},
    doi = {10.1145/3453986},
    abstract = {Large-scale optimization problems at the core of many graphics, vision, and imaging applications are often implemented by hand in tedious and error-prone processes in order to achieve high performance (in particular on GPUs), despite recent developments in libraries and DSLs. At the same time, these hand-crafted solver implementations reveal that the key for high performance is a problem-specific schedule that enables efficient usage of the underlying hardware. In this work, we incorporate this insight into Thallo, a domain-specific language for large-scale non-linear least squares optimization problems. We observe various code reorganizations performed by implementers of high-performance solvers in the literature, and then define a set of basic operations that span these scheduling choices, thereby defining a large scheduling space. Users can either specify code transformations in a scheduling language or use an autoscheduler. Thallo takes as input a compact, shader-like representation of an energy function and a (potentially auto-generated) schedule, translating the combination into high-performance GPU solvers. Since Thallo can generate solvers from a large scheduling space, it can handle a large set of large-scale non-linear and non-smooth problems with various degrees of non-locality and compute-to-memory ratios, including diverse applications such as bundle adjustment, face blendshape fitting, and spatially-varying Poisson deconvolution, as seen in Figure~1. Abstracting schedules from the optimization, we outperform state-of-the-art GPU-based optimization DSLs by an average of 16\texttimes{} across all applications introduced in this work, and even some published hand-written GPU solvers by 30\%+.},
    journal = {{ACM} Transactions on Graphics},
    month = sep,
    articleno = {184},
    numpages = {14},
    keywords = {optimization, scheduling, GPU, 3D Reconstruction, non-linear least-squares, DSL}
}