Parcourir la source

feat: add paper summarizer

metya il y a 2 ans
Parent
commit
2427cb0f35
9 fichiers modifiés avec 327 ajouts et 45 suppressions
  1. 1 1
      Dockerfile
  2. 5 1
      config.py
  3. 138 0
      develop.yml
  4. 92 0
      dialog.py
  5. 1 1
      docker-compose.yaml
  6. 1 1
      pyproject.toml
  7. 9 7
      requirements.txt
  8. 49 0
      summarize.py
  9. 31 34
      vanitybot.py

+ 1 - 1
Dockerfile

@@ -9,7 +9,8 @@ RUN apk add --no-cache --virtual .build-deps gcc musl-dev
 
 WORKDIR /app
-ADD prod-requirements.txt /app
-RUN pip install --no-cache-dir -r prod-requirements.txt
+# Copy the exact file pip installs from; the rest of the tree is added later.
+ADD requirements.txt /app
+RUN pip install --no-cache-dir -r requirements.txt
 RUN apk del .build-deps
 
 ADD . /app

+ 5 - 1
config.py

@@ -4,4 +4,8 @@ from dotenv import dotenv_values
 if env_var := dotenv_values('token'):
     API_TOKEN = env_var["API_TOKEN"]
 else:
-    API_TOKEN = getenv("API_TOKEN")
+    API_TOKEN = getenv("API_TOKEN")
+    
+    
+if __name__ == "__main__":
+    # Manual check that a token was resolved when this module is run directly.
+    # NOTE(review): this writes the secret API token to stdout.
+    print(API_TOKEN)

+ 138 - 0
develop.yml

@@ -0,0 +1,138 @@
+name: vanity
+channels:
+  - conda-forge
+dependencies:
+  - aiodns=3.0.0
+  - aiohttp=3.8.3
+  - aiosignal=1.3.1
+  - appdirs=1.4.4
+  - appnope=0.1.3
+  - argcomplete=1.12.3
+  - asttokens=2.2.1
+  - async-timeout=4.0.2
+  - attrs=22.2.0
+  - backcall=0.2.0
+  - backports=1.0
+  - backports.functools_lru_cache=1.6.4
+  - beautifulsoup4=4.11.2
+  - brotlipy=0.7.0
+  - bzip2=1.0.8
+  - ca-certificates=2022.12.7
+  - cchardet=2.1.7
+  - certifi=2022.12.7
+  - cffi=1.15.1
+  - cfgv=3.3.1
+  - charset-normalizer=2.1.1
+  - click=8.1.3
+  - colorama=0.4.6
+  - comm=0.1.2
+  - commitizen=2.28.1
+  - cryptography=39.0.0
+  - cssselect=1.2.0
+  - debugpy=1.6.6
+  - decli=0.5.2
+  - decorator=5.1.1
+  - distlib=0.3.6
+  - executing=1.2.0
+  - fake-useragent=1.1.1
+  - filelock=3.9.0
+  - frozenlist=1.3.3
+  - icu=70.1
+  - identify=2.5.17
+  - idna=3.4
+  - importlib-metadata=6.0.0
+  - importlib_metadata=6.0.0
+  - ipykernel=6.21.1
+  - ipython=8.9.0
+  - jedi=0.18.2
+  - jinja2=3.1.2
+  - jupyter_client=8.0.2
+  - jupyter_core=5.2.0
+  - libcxx=14.0.6
+  - libffi=3.4.2
+  - libiconv=1.17
+  - libsodium=1.0.18
+  - libsqlite=3.40.0
+  - libuv=1.44.2
+  - libxml2=2.10.3
+  - libxslt=1.1.37
+  - libzlib=1.2.13
+  - lxml=4.9.2
+  - markupsafe=2.1.2
+  - matplotlib-inline=0.1.6
+  - multidict=6.0.4
+  - mypy=1.0.0
+  - mypy_extensions=1.0.0
+  - ncurses=6.3
+  - nest-asyncio=1.5.6
+  - nodeenv=1.7.0
+  - openssl=3.0.8
+  - packaging=21.3
+  - parse=1.19.0
+  - parso=0.8.3
+  - pexpect=4.8.0
+  - pickleshare=0.7.5
+  - pip=23.0
+  - platformdirs=2.6.2
+  - pre-commit=3.0.4
+  - prompt-toolkit=3.0.36
+  - prompt_toolkit=3.0.36
+  - psutil=5.9.4
+  - ptyprocess=0.7.0
+  - pure_eval=0.2.2
+  - pycares=4.0.0
+  - pycparser=2.21
+  - pyee=8.1.0
+  - pygments=2.14.0
+  - pyopenssl=23.0.0
+  - pyparsing=3.0.9
+  - pyppeteer=1.0.2
+  - pyquery=2.0.0
+  - pysocks=1.7.1
+  - python=3.11.0
+  - python-dateutil=2.8.2
+  - python-dotenv=0.21.1
+  - python_abi=3.11
+  - pyyaml=6.0
+  - pyzmq=25.0.0
+  - questionary=1.10.0
+  - readline=8.1.2
+  - requests=2.28.2
+  - requests-html=0.10.0
+  - setuptools=67.1.0
+  - six=1.16.0
+  - soupsieve=2.3.2.post1
+  - stack_data=0.6.2
+  - termcolor=1.1.0
+  - tk=8.6.12
+  - tomli=2.0.1
+  - tomlkit=0.11.6
+  - tornado=6.2
+  - tqdm=4.64.1
+  - traitlets=5.9.0
+  - typing=3.10.0.0
+  - typing-extensions=4.4.0
+  - typing_extensions=4.4.0
+  - tzdata=2022g
+  - ujson=5.7.0
+  - ukkonen=1.0.1
+  - urllib3=1.26.14
+  - uvloop=0.17.0
+  - virtualenv=20.18.0
+  - w3lib=2.1.1
+  - wcwidth=0.2.6
+  - websockets=10.4
+  - wheel=0.38.4
+  - xz=5.2.6
+  - yaml=0.2.5
+  - yarl=1.8.2
+  - zeromq=4.3.4
+  - zipp=3.12.1
+  - pip:
+      - aiogram==2.25.1
+      - aiogram-dialog==1.9.0
+      - babel==2.9.1
+      - cachetools==4.2.4
+      - magic-filter==1.0.9
+      - pytz==2022.7.1
+# NOTE(review): absolute path of the environment on the exporting machine
+# (Homebrew mambaforge); it will not exist on other hosts.
+prefix: /opt/homebrew/Caskroom/mambaforge/base/envs/vanity

+ 92 - 0
dialog.py

@@ -0,0 +1,92 @@
+
+from typing import Any
+import operator
+from aiogram.dispatcher.filters.state import StatesGroup, State
+from aiogram.types import Message, CallbackQuery
+from aiogram_dialog import Window, Dialog, DialogManager, StartMode
+from aiogram_dialog.widgets.kbd import Radio
+from aiogram_dialog.widgets.text import Format
+from aiogram.types import ParseMode
+
+from summarize import get_paper_desc, get_key_moments, get_summary
+
+
+class MySG(StatesGroup):
+    """State group for the paper dialog; it declares a single state, ``main``."""
+    main = State()
+
+# (label, item id) pairs for the radio keyboard. The ids are the strings that
+# get_data and on_button_selected compare against ("2" = summary, "3" = key moments).
+buttons = [
+        ("Abstract", '1'),
+        ("Summary", '2'),
+        ("Key Moments", '3'),
+    ]
+
+async def get_data(dialog_manager: DialogManager, **kwargs):
+    """Build the ``{"text": ...}`` payload rendered by the dialog window.
+
+    The selected radio item decides what is shown: "2" shows the cached
+    summary, "3" the cached key moments, anything else the abstract. When
+    "2" or "3" is selected but nothing is cached yet, the text is "OOOPS!".
+    """
+    context = dialog_manager.current_context()
+    selected = context.widget_data.get('radio_buttons')
+    start, cache = context.start_data, context.dialog_data
+    title = start["title"]
+    url = start["url"]
+
+    # Seed the abstract from the initial reply message the first time through.
+    if not cache.get('abs'):
+        cache["abs"] = start["reply_message"]
+    abstract = cache.get('abs')
+
+    if selected not in ("2", "3"):
+        return {"text": abstract}
+
+    cache_key = "summary" if selected == "2" else "key_moments"
+    body = cache.get(cache_key)
+    if body:
+        return {"text": f"{url}\n\n***{title}***\n\n{body}"}
+    return {"text": "OOOPS!"}
+
+
+async def on_button_selected(c: CallbackQuery, widget: Any, manager: DialogManager, item_id: str):
+    """Fetch and cache the summary or key moments when their button is picked.
+
+    Item "2" fills ``dialog_data["summary"]`` and item "3" fills
+    ``dialog_data["key_moments"]``, each only when no truthy value is cached
+    yet. Any other item does nothing. Returns ``{"text": item_id}``.
+    """
+    context = manager.current_context()
+    if item_id == "2" and not context.dialog_data.get('summary'):
+        await c.answer("Getting Summary, please wait")
+        context.dialog_data["summary"] = await get_summary(url=context.start_data["url"])
+    elif item_id == "3" and not context.dialog_data.get("key_moments"):
+        await c.answer("Getting Key Moments, please wait")
+        context.dialog_data["key_moments"] = await get_key_moments(url=context.start_data["url"])
+
+    return {"text": item_id}
+
+
+
+# Radio keyboard built from `buttons`: item[0] is the label and item[1] the id.
+# The "✓" template is presumably the checked-state text and the plain one the
+# unchecked-state text — confirm against the Radio widget signature.
+buttons_kbd = Radio(
+    Format("✓ {item[0]}"),
+    Format("{item[0]}"),
+    id="radio_buttons",
+    item_id_getter=operator.itemgetter(1),
+    items=buttons,
+    on_click=on_button_selected,
+)
+
+# Single-window dialog for MySG.main: renders the "text" value supplied by
+# get_data together with the radio keyboard, using Markdown parse mode.
+dialog = Dialog(
+    Window(
+        Format("{text}"),
+        buttons_kbd,
+        state=MySG.main,
+        getter=get_data,
+        parse_mode=ParseMode.MARKDOWN,
+        # preview_data={"button": "1"}
+    )
+)
+
+

+ 1 - 1
docker-compose.yaml

@@ -4,4 +4,4 @@ services:
     build: .
     environment:
       - API_TOKEN
-    restart: always
+    restart: always

+ 1 - 1
pyproject.toml

@@ -7,7 +7,6 @@ authors = ["metya <metya.tm@gmail.com>"]
 [tool.poetry.dependencies]
 python = "^3.9"
 aiogram = "^2.10.1"
-logzero = "^1.5.0"
 cchardet = "^2.1.6"
 ujson = "^5.4.0"
 aiohttp = {extras = ["speedups"], version = "^3.7.4"}
@@ -25,6 +24,7 @@ pre-commit = "^2.10.1"
 name = "cz_conventional_commits"
 version = "0.1.0"
 tag_format = "$version"
+
 [build-system]
 requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"

+ 9 - 7
requirements.txt

@@ -1,7 +1,12 @@
-logzero==1.6.3
-requests==2.25.1
-aiogram==2.11.2
-attrs==20.3.0
-attr==0.3.1
-beautifulsoup4==4.9.3
-python-dotenv==0.16.0
+# aiogram stays on the 2.x line: the bot imports aiogram.contrib and
+# aiogram.dispatcher.filters.state, and aiogram-dialog 1.x targets aiogram 2.
+aiogram==2.25.1
+aiogram-dialog==1.9.0
+aiohttp
+aiodns
+ujson
+uvloop
+cchardet
+beautifulsoup4
+python-dotenv
+requests-html

+ 49 - 0
summarize.py

@@ -0,0 +1,49 @@
+from requests_html import AsyncHTMLSession
+from bs4 import BeautifulSoup
+from contextlib import suppress
+from requests import get
+
+def get_paper_desc(id_paper: str) -> tuple | None:
+    """Fetch the Open Graph URL, title and description of an arXiv paper.
+
+    Args:
+        id_paper: arXiv identifier, inserted into ``https://arxiv.org/abs/<id>``.
+
+    Returns:
+        ``(url, title, description)`` with newlines in the description
+        replaced by spaces, or ``None`` when the page cannot be fetched or
+        one of the three meta tags is missing.
+    """
+    # Local import so this function does not rely on a file-level import change.
+    from requests import RequestException
+
+    try:
+        # NOTE: this is a blocking HTTP call; the timeout stops it hanging forever.
+        request = get(f'https://arxiv.org/abs/{id_paper}', timeout=10)
+    except RequestException:
+        return None
+    if request.ok:
+        soup = BeautifulSoup(request.content, features="lxml")
+        # A missing tag makes find() return None, so the chained .get() /
+        # .replace() raises AttributeError, not TypeError.
+        with suppress(AttributeError, TypeError):
+            url = soup.find('meta', property='og:url').get('content')
+            title = soup.find('meta', property='og:title').get('content')
+            description = soup.find('meta', property='og:description').get('content').replace('\n', ' ')
+            return url, title, description
+    return None
+
+async def _fetch_kagi_description(url: str | None, query_suffix: str = "") -> str:
+    """Render the Kagi summarizer page for an arXiv paper and return its text.
+
+    Args:
+        url: arXiv ``/abs/`` URL of the paper; a falsy value yields the fallback.
+        query_suffix: extra query string appended after the paper URL.
+
+    Returns:
+        Text of the first ``p.description`` element, or
+        ``"Nothing to retrieve :("`` when it stays empty or missing.
+    """
+    if not url:
+        return "Nothing to retrieve :("
+    # Only the path segment is rewritten, so an "abs" elsewhere in the URL is kept.
+    url = url.replace("/abs/", "/pdf/", 1)
+    async_session = AsyncHTMLSession()
+    try:
+        async_response = await async_session.get(
+            f"https://labs.kagi.com/ai/sum?url={url}.pdf{query_suffix}"
+        )
+        # Render with a short wait first, then once more with a longer wait.
+        for delay in (5, 10):
+            await async_response.html.arender(sleep=delay)
+            element = async_response.html.find("p.description", first=True)
+            if element is not None and element.text:
+                return element.text
+        return "Nothing to retrieve :("
+    finally:
+        # Always release the session, including on errors and missing elements.
+        await async_session.close()
+
+
+async def get_summary(url: str = "https://arxiv.org/abs/2102.12092v2") -> str:
+    """Return the Kagi summary for the paper at ``url``, or a fallback message."""
+    return await _fetch_kagi_description(url)
+
+
+async def get_key_moments(url: str = "https://arxiv.org/abs/2102.12092v2") -> str:
+    """Return the Kagi key moments (``expand=1``) for the paper at ``url``, or a fallback message."""
+    return await _fetch_kagi_description(url, "&expand=1")

+ 31 - 34
vanitybot.py

@@ -1,41 +1,23 @@
 import re
 import logging
-from attr import __description__
-from logzero import logger
-from aiogram.types import message
-from aiogram.types.message import Message, ParseMode
+from aiogram.contrib.fsm_storage.memory import MemoryStorage
 from aiogram import Bot, Dispatcher, executor, types
+from aiogram_dialog import DialogManager, DialogRegistry, StartMode
 from config import API_TOKEN
-from bs4 import BeautifulSoup
-from requests import get
-from contextlib import suppress
-
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-
-# utils funtions
-def get_paper_desc(id_paper: str) -> tuple:
-    
-    request = get(f'https://arxiv.org/abs/{id_paper}')
-    if request.ok:
-        soup = BeautifulSoup(request.content)
-        with suppress(TypeError): 
-            url = soup.find('meta', property='og:url').get('content')
-            title = soup.find('meta', property='og:title').get('content')
-            description = soup.find('meta', property='og:description').get('content').replace('\n', ' ')
-            return url, title, description
-    
-    return None
+from dialog import dialog, MySG
+from summarize import get_paper_desc
 
 # Initialize bot and dispatcher
 bot = Bot(token=API_TOKEN)
-dp = Dispatcher(bot)
+dp = Dispatcher(bot, storage=MemoryStorage())
+registry = DialogRegistry(dp)
+registry.register(dialog)
 
 help_message = "Hello!\n\n\
 Send me a link paper from arxiv.org and \
-I'll send you back snipet of paper and arxiv-vanity.com mobile friendly link!\n\
+I'll send you back snippet of paper and arxiv-vanity.com mobile friendly link!\n\
 Or add me to chat and I'll be watching the arxiv link and \
-reply to them with fancy axiv-vanity links."
+reply to them with fancy arxiv-vanity links."
 
 @dp.message_handler(commands=['start'])
 async def process_start_command(message: types.Message):
@@ -47,20 +29,35 @@ async def process_help_command(message: types.Message):
     await message.reply(help_message)
 
 
-@dp.message_handler(regexp='arxiv.org\/(?:abs|pdf)\/\d{4}\.\d{5}')
-async def vanitify(message: types.Message):
+@dp.message_handler(regexp=r'arxiv.org\/(?:abs|pdf)\/\d{4}\.\d{5}')
+async def vanitify(message: types.Message, dialog_manager: DialogManager):
+    """Start a paper dialog for every arXiv id found in the message text.
+
+    Each dialog is started in a new stack with the paper id, the reply text
+    and, when the arXiv page could be read, its url, title and abstract.
+    """
     papers_ids = re.findall(r'arxiv.org\/(?:abs|pdf)\/(\d{4}\.\d{5})', message.text)
-    
+
     for id_ in papers_ids:
         reply_message = f"[Here you can read the paper in mobile friendly way](https://www.arxiv-vanity.com/papers/{id_})"
-
+        data = {
+            "id": id_,
+            "reply_message": reply_message,
+            "url": None,
+            "title": None,
+            "abs": None
+        }
         if desc := get_paper_desc(id_):
             url, title, description = desc
             reply_message = f'{url}\n\n***{title}***\n\n{description}\n\n{reply_message}'
-        
-            
-        await message.reply(reply_message, parse_mode=ParseMode.MARKDOWN)
+            data.update({
+                "reply_message": reply_message,
+                "url": url,
+                "title": title,
+                "abs": description
+                })
+        else:
+            # The page is fetched from arxiv.org, so the message names that host.
+            reply_message = 'Something went wrong. Can not reach arxiv.org :('
+            data["reply_message"] = reply_message
+
+        await dialog_manager.start(MySG.main, mode=StartMode.NEW_STACK, data=data)
 
 
 if __name__ == "__main__":
+    logging.basicConfig(level=logging.DEBUG)
     executor.start_polling(dp, skip_updates=True)