[
    {
        "id": "authors:xypmg-afb90",
        "collection": "authors",
        "collection_id": "xypmg-afb90",
        "cite_using_url": "https://authors.library.caltech.edu/records/xypmg-afb90",
        "type": "conference_item",
        "title": "HGQ: High Granularity Quantization for Real-time Neural Networks on FPGAs",
        "author": [
            {
                "family_name": "Sun",
                "given_name": "Chang",
                "orcid": "0000-0003-2774-175X",
                "clpid": "Sun-Chang"
            },
            {
                "family_name": "Que",
                "given_name": "Zhiqiang",
                "orcid": "0000-0002-9263-6529"
            },
            {
                "family_name": "Aarrestad",
                "given_name": "Thea",
                "orcid": "0000-0002-7671-243X"
            },
            {
                "family_name": "Loncar",
                "given_name": "Vladimir",
                "orcid": "0000-0003-3651-0232"
            },
            {
                "family_name": "Ngadiuba",
                "given_name": "Jennifer",
                "orcid": "0000-0002-0055-2935"
            },
            {
                "family_name": "Luk",
                "given_name": "Wayne",
                "orcid": "0000-0002-6750-927X"
            },
            {
                "family_name": "Spiropulu",
                "given_name": "Maria",
                "orcid": "0000-0001-8172-7081",
                "clpid": "Spiropulu-M"
            }
        ],
        "abstract": "Neural networks with sub-microsecond inference latency are required by many critical applications. Targeting such applications deployed on FPGAs, we present High Granularity Quantization (HGQ), a quantization-aware training framework that optimizes parameter bit-widths through gradient descent. Unlike conventional methods, HGQ determines the optimal bit-width for each parameter independently, making it suitable for hardware platforms supporting heterogeneous arbitrary precision arithmetic. In our experiments, HGQ shows superior performance compared to existing network compression methods, achieving orders of magnitude reduction in resource consumption and latency while maintaining the accuracy on several benchmark tasks. These improvements enable the deployment of complex models previously infeasible due to resource or latency constraints. HGQ is open-source (https://github.com/calad0i/hgq2) and is used for developing next-generation trigger systems at the CERN ATLAS and CMS experiments for particle physics, enabling the use of advanced machine learning models for real-time data selection with sub-microsecond latency.",
        "doi": "10.1145/3748173.3779200",
        "isbn": "979-8-4007-2079-6",
        "publisher": "ACM",
        "publication": "Proceedings of the 2026 ACM/SIGDA International Symposium on Field Programmable Gate Arrays",
        "publication_date": "2026-02-21",
        "pages": "79-91"
    }
]