120 lines
4.2 KiB
Python
120 lines
4.2 KiB
Python
# SPDX-FileCopyrightText: 2015 Eric Larson
|
|
#
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
from __future__ import annotations
|
|
|
|
import mmap
|
|
from tempfile import NamedTemporaryFile
|
|
from typing import TYPE_CHECKING, Any, Callable
|
|
|
|
if TYPE_CHECKING:
|
|
from http.client import HTTPResponse
|
|
|
|
|
|
class CallbackFileWrapper:
    """
    Small wrapper around a fp object which will tee everything read into a
    buffer, and when that file is closed it will execute a callback with the
    contents of that buffer.

    All attributes are proxied to the underlying file object.

    This class uses members with a double underscore (__) leading prefix so as
    not to accidentally shadow an attribute.

    The data is stored in a temporary file until it is all available. As long
    as the temporary files directory is disk-based (sometimes it's a
    memory-backed-``tmpfs`` on Linux), data will be unloaded to disk if memory
    pressure is high. For small files the disk usually won't be used at all,
    it'll all be in the filesystem memory cache, so there should be no
    performance impact.
    """

    def __init__(
        self, fp: HTTPResponse, callback: Callable[[bytes], None] | None
    ) -> None:
        """
        Wrap *fp* and remember *callback*.

        :param fp: the file-like object whose reads are teed into the buffer.
        :param callback: called from ``_close()`` with the buffered data
            (``b""`` when nothing was buffered, otherwise a ``memoryview``
            over an ``mmap`` of the temporary file); ``None`` disables the
            callback.
        """
        self.__buf = NamedTemporaryFile("rb+", delete=True)
        self.__fp = fp
        self.__callback = callback

    def __getattr__(self, name: str) -> Any:
        """Proxy any attribute not found on the wrapper to the wrapped fp."""
        # The vagaries of garbage collection means that self.__fp is
        # not always set. Using __getattribute__ with the private
        # (mangled) name[0] allows looking up the attribute value and
        # raising an AttributeError when it doesn't exist. This stops things
        # from infinitely recursing calls to getattr in the case where
        # self.__fp hasn't been set.
        #
        # [0] https://docs.python.org/2/reference/expressions.html#atom-identifiers
        fp = self.__getattribute__("_CallbackFileWrapper__fp")
        return getattr(fp, name)

    def __is_fp_closed(self) -> bool:
        """
        Report whether the wrapped fp looks closed.

        Checks ``fp.fp is None`` first, then falls back to ``fp.closed``.
        Returns ``False`` when neither attribute is available.
        """
        try:
            return self.__fp.fp is None

        except AttributeError:
            pass

        try:
            closed: bool = self.__fp.closed
            return closed

        except AttributeError:
            pass

        # We just don't cache it then.
        # TODO: Add some logging here...
        return False

    def _close(self) -> None:
        """
        Hand the buffered data to the callback (if any) and release the buffer.

        The callback reference is dropped and the temporary file is closed
        even when the callback raises; the exception still propagates to the
        caller.
        """
        try:
            if self.__callback:
                if self.__buf.tell() == 0:
                    # Empty file:
                    result = b""
                else:
                    # Return the data without actually loading it into memory,
                    # relying on Python's buffer API and mmap(). mmap() just
                    # gives a view directly into the filesystem's memory
                    # cache, so it doesn't result in duplicate memory use.
                    self.__buf.seek(0, 0)
                    result = memoryview(
                        mmap.mmap(self.__buf.fileno(), 0, access=mmap.ACCESS_READ)
                    )
                self.__callback(result)
        finally:
            # We assign this to None here, because otherwise we can get into
            # really tricky problems where the CPython interpreter dead locks
            # because the callback is holding a reference to something which
            # has a __del__ method. Setting this to None breaks the cycle
            # and allows the garbage collector to do its thing normally.
            self.__callback = None

            # Closing the temporary file releases memory and frees disk space.
            # Important when caching big files.
            self.__buf.close()

    def read(self, amt: int | None = None) -> bytes:
        """
        Read from the wrapped fp, teeing any non-empty data into the buffer.

        Once the wrapped fp reports closed, ``_close()`` is invoked.
        """
        data: bytes = self.__fp.read(amt)
        if data:
            # We may be dealing with b'', a sign that things are over:
            # it's passed e.g. after we've already closed self.__buf.
            self.__buf.write(data)
        if self.__is_fp_closed():
            self._close()

        return data

    def _safe_read(self, amt: int) -> bytes:
        """
        Call the wrapped fp's ``_safe_read``, teeing the data into the buffer.

        A two-byte ``b"\\r\\n"`` read is returned without being buffered.
        Once the wrapped fp reports closed, ``_close()`` is invoked.
        """
        data: bytes = self.__fp._safe_read(amt)  # type: ignore[attr-defined]
        if amt == 2 and data == b"\r\n":
            # urllib executes this read to toss the CRLF at the end
            # of the chunk.
            return data

        if data:
            # Same guard as read(): an empty result may arrive after
            # self.__buf has already been closed, and writing to a closed
            # file raises ValueError.
            self.__buf.write(data)
        if self.__is_fp_closed():
            self._close()

        return data
|