From 3126543321c5180be8a81ffc3e5f898a55cffe85 Mon Sep 17 00:00:00 2001
From: Andrew Ferrazzutti
Date: Tue, 6 Apr 2021 01:54:21 -0400
Subject: [PATCH] Deduplicate emotes

Cache the Matrix content URI of inline images (emoji and sticker-style
emotes embedded in message HTML) so that each distinct image is uploaded
at most once.

* Add a `media` table (media_id TEXT PRIMARY KEY, mxc TEXT NOT NULL) and
  a matching `Media` database class with insert/update/get_by_id.
* Add `Portal._get_mxc_for_remote_media`, which looks an image up by its
  media ID (falling back to the image URL when no ID is given), and only
  downloads and reuploads it when no cached mxc URI exists.
* Sticker-style images are keyed by their `data-stickon-pkg-cd` and
  `data-stickon-stk-cd` attributes; emojione images are keyed by URL.
* Inline images are only embedded in the HTML body of unencrypted
  portals, because media referenced from HTML is uploaded unencrypted.
* `_reupload_remote_media` gains a `disable_encryption` flag so that the
  cached inline media can be uploaded in the clear on request.
---
Reviewer notes (not part of the commit message):
- Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch; all hunk sizes and the diffstat were re-checked, but
  context indentation is inferred, so `git apply --ignore-whitespace` may be
  needed.
- `disable_encryption` now defaults to False. With a default of True every
  existing caller of `_reupload_remote_media` would silently stop encrypting
  attachments in encrypted rooms; the only caller that needs unencrypted
  uploads already passes `disable_encryption=True` explicitly.
- `Media.get_by_id` is annotated as returning `Optional['Media']`, which is
  what it actually returns (`cls(**row)`).
- The sticker branch reads `chunk.get("alt", "")` so that an image without an
  `alt` attribute falls back to the `n/a` placeholder instead of raising
  KeyError (the replaced line handled a missing `alt` explicitly).
- NOTE(review): the two `msg_html += f'{alt}'` lines look like they lost
  inline HTML markup when this copy of the patch was extracted (the new line
  never uses `media_mxc`). They are left exactly as found - please restore
  them from the original commit before applying.

 matrix_puppeteer_line/db/__init__.py |  5 +--
 matrix_puppeteer_line/db/media.py    | 51 ++++++++++++++++++++++++++++
 matrix_puppeteer_line/db/upgrade.py  | 10 +++++-
 matrix_puppeteer_line/portal.py      | 44 +++++++++++++++++-------
 4 files changed, 94 insertions(+), 16 deletions(-)
 create mode 100644 matrix_puppeteer_line/db/media.py

diff --git a/matrix_puppeteer_line/db/__init__.py b/matrix_puppeteer_line/db/__init__.py
index c52a762..9368399 100644
--- a/matrix_puppeteer_line/db/__init__.py
+++ b/matrix_puppeteer_line/db/__init__.py
@@ -5,11 +5,12 @@ from .user import User
 from .puppet import Puppet
 from .portal import Portal
 from .message import Message
+from .media import Media
 
 
 def init(db: Database) -> None:
-    for table in (User, Puppet, Portal, Message):
+    for table in (User, Puppet, Portal, Message, Media):
         table.db = db
 
 
-__all__ = ["upgrade_table", "User", "Puppet", "Portal", "Message"]
+__all__ = ["upgrade_table", "User", "Puppet", "Portal", "Message", "Media"]
diff --git a/matrix_puppeteer_line/db/media.py b/matrix_puppeteer_line/db/media.py
new file mode 100644
index 0000000..24c71b3
--- /dev/null
+++ b/matrix_puppeteer_line/db/media.py
@@ -0,0 +1,51 @@
+# matrix-puppeteer-line - A very hacky Matrix-LINE bridge based on running LINE's Chrome extension in Puppeteer
+# Copyright (C) 2020-2021 Tulir Asokan, Andrew Ferrazzutti
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+from typing import Optional, ClassVar, TYPE_CHECKING
+
+from attr import dataclass
+
+from mautrix.types import ContentURI
+from mautrix.util.async_db import Database
+
+fake_db = Database("") if TYPE_CHECKING else None
+
+
+@dataclass
+class Media:
+    db: ClassVar[Database] = fake_db
+
+    media_id: str
+    mxc: ContentURI
+    # TODO Consider whether mime_type, file_name, and size are needed.
+
+    async def insert(self) -> None:
+        q = ("INSERT INTO media (media_id, mxc) "
+             "VALUES ($1, $2)")
+        await self.db.execute(q, self.media_id, self.mxc)
+
+    async def update(self) -> None:
+        q = ("UPDATE media SET mxc=$2 "
+             "WHERE media_id=$1")
+        await self.db.execute(q, self.media_id, self.mxc)
+
+    @classmethod
+    async def get_by_id(cls, media_id: str) -> Optional['Media']:
+        q = ("SELECT media_id, mxc "
+             "FROM media WHERE media_id=$1")
+        row = await cls.db.fetchrow(q, media_id)
+        if not row:
+            return None
+        return cls(**row)
diff --git a/matrix_puppeteer_line/db/upgrade.py b/matrix_puppeteer_line/db/upgrade.py
index 5f796a0..09236e3 100644
--- a/matrix_puppeteer_line/db/upgrade.py
+++ b/matrix_puppeteer_line/db/upgrade.py
@@ -59,4 +59,12 @@ async def upgrade_avatars(conn: Connection) -> None:
     await conn.execute("""ALTER TABLE portal
        ADD COLUMN IF NOT EXISTS icon_path TEXT,
        ADD COLUMN IF NOT EXISTS icon_mxc TEXT
-    """)
\ No newline at end of file
+    """)
+
+
+@upgrade_table.register(description="Deduplicated media")
+async def upgrade_media(conn: Connection) -> None:
+    await conn.execute("""CREATE TABLE IF NOT EXISTS media (
+        media_id TEXT PRIMARY KEY,
+        mxc TEXT NOT NULL
+    )""")
\ No newline at end of file
diff --git a/matrix_puppeteer_line/portal.py b/matrix_puppeteer_line/portal.py
index 4a1dbcb..d112378 100644
--- a/matrix_puppeteer_line/portal.py
+++ b/matrix_puppeteer_line/portal.py
@@ -33,7 +33,7 @@ from mautrix.errors import MatrixError
 from mautrix.util.simple_lock import SimpleLock
 from mautrix.util.network_retry import call_with_net_retry
 
-from .db import Portal as DBPortal, Message as DBMessage
+from .db import Portal as DBPortal, Message as DBMessage, Media as DBMedia
 from .config import Config
 from .rpc import ChatInfo, Participant, Message, Client, PathImage
 from . import user as u, puppet as p, matrix as m
@@ -212,6 +212,7 @@ class Portal(DBPortal, BasePortal):
 
         event_id = None
         if evt.image_url:
+            # TODO Deduplicate stickers, but only if encryption is disabled
             content = await self._handle_remote_photo(source, intent, evt)
             event_id = await self._send_message(intent, content, timestamp=evt.timestamp)
         elif evt.html and not evt.html.isspace():
@@ -244,21 +245,22 @@ class Portal(DBPortal, BasePortal):
                     if msg_html:
                         msg_html += chunk["data"]
                 elif ctype == "img":
-                    if not msg_html:
-                        msg_html = msg_text
-
                     cclass = chunk["class"]
                     if cclass == "emojione":
                         alt = chunk["alt"]
+                        media_id = None
                     else:
-                        alt = f':{"?" if "alt" not in chunk else "".join(filter(lambda char: char.isprintable(), chunk["alt"]))}:'
+                        alt = "".join(filter(str.isprintable, chunk.get("alt", ""))).strip()
+                        alt = f':{alt if alt else "n/a"}:'
+                        media_id = f'{chunk.get("data-stickon-pkg-cd", 0)}/{chunk.get("data-stickon-stk-cd", 0)}'
 
+                    # NOTE Not encrypting content linked to by HTML tags
+                    if not self.encrypted:
+                        media_mxc = await self._get_mxc_for_remote_media(source, intent, chunk["src"], media_id)
+                        if not msg_html:
+                            msg_html = msg_text
+                        msg_html += f'{alt}'
                     msg_text += alt
-                    # TODO Make a standalone function for this, and cache mxc in DB
-                    #      ID is some combination of data-stickon-pkg-cd, data-stickon-stk-cd, src
-                    resp = await source.client.read_image(chunk["src"])
-                    media_info = await self._reupload_remote_media(resp.data, intent, resp.mime)
-                    msg_html += f'{alt}'
 
             content = TextMessageEventContent(
                 msgtype=MessageType.TEXT,
@@ -279,9 +281,25 @@ class Portal(DBPortal, BasePortal):
             msgtype=MessageType.IMAGE, body=media_info.file_name,
             info=ImageInfo(mimetype=media_info.mime_type, size=media_info.size))
 
+    async def _get_mxc_for_remote_media(self, source: 'u.User', intent: IntentAPI,
+                                        media_url: str, media_id: Optional[str] = None
+                                        ) -> ContentURI:
+        if not media_id:
+            media_id = media_url
+        media_info = await DBMedia.get_by_id(media_id)
+        if not media_info:
+            self.log.debug(f"Did not find existing mxc URL for {media_id}, uploading media now")
+            resp = await source.client.read_image(media_url)
+            media_info = await self._reupload_remote_media(resp.data, intent, resp.mime, disable_encryption=True)
+            await DBMedia(media_id=media_id, mxc=media_info.mxc).insert()
+            self.log.debug(f"Uploaded media as {media_info.mxc}")
+        else:
+            self.log.debug(f"Found existing mxc URL for {media_id}: {media_info.mxc}")
+        return media_info.mxc
+
     async def _reupload_remote_media(self, data: bytes, intent: IntentAPI,
-                                     mime_type: str = None, file_name: str = None
-                                     ) -> ReuploadedMediaInfo:
+                                     mime_type: str = None, file_name: str = None,
+                                     disable_encryption: bool = False) -> ReuploadedMediaInfo:
         if not mime_type:
             mime_type = magic.from_buffer(data, mime=True)
         upload_mime_type = mime_type
@@ -290,7 +308,7 @@ class Portal(DBPortal, BasePortal):
             upload_file_name = file_name
 
         decryption_info = None
-        if self.encrypted and encrypt_attachment:
+        if self.encrypted and encrypt_attachment and not disable_encryption:
             data, decryption_info = encrypt_attachment(data)
             upload_mime_type = "application/octet-stream"
             upload_file_name = None