# summarize.py
import re
import asyncio
from typing import Any

from aiohttp import ClientSession
from bs4 import BeautifulSoup

from db import add_authors_and_paper, add_or_update_paper, check_paper

base_url = "https://engine.scholarcy.com/api/"
extract_url = "metadata/extract"
highlights_url = "highlights/extract"
summarize_endpoint = "https://summarizer.scholarcy.com/summarize"
extract_endpoint = base_url + extract_url
highlights_endpoint = base_url + highlights_url
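
# Endpoint selection (see fetch_summary below): highlights/extract when
# highlights are requested, the standalone summarizer host when a synopsis is
# requested, and metadata/extract otherwise.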

params = dict(
    external_metadata="false",
    parse_references="false",
    generate_summary="true",
    summary_engine="v4",
    replace_pronouns="false",
    strip_dialogue="false",
    summary_size="400",
    summary_percent="0",
    structured_summary="false",
    keyword_method="sgrank+acr",
    keyword_limit="25",
    abbreviation_method="schwartz",
    extract_claims="true",
    key_points="5",
    citation_contexts="false",
    inline_citation_links="false",
    extract_pico="false",
    extract_tables="false",
    extract_figures="true",
    require_captions="false",
    extract_sections="false",
    unstructured_content="false",
    include_markdown="true",
    extract_snippets="true",
    engine="v2",
    image_engine="v1+v2",
)
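# The defaults above request a ~400-word v4 summary, sgrank+acr keyword
# extraction (up to 25 keywords), Schwartz abbreviation detection, claim and
# figure extraction, and markdown output. Values stay strings because they
# are sent verbatim as query-string parameters.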


async def get_summary(cross_data: Any,
                      paper_id: str,
                      paper_url: str,
                      synopsys=False,
                      highlights=True,
                      context_id: str = "qwe"):
    """Fetch a summary for `paper_id` (from the db when cached, otherwise from
    Scholarcy) and push the result into the dialog context for `context_id`."""

    async def fetch_summary(paper_url: str, synopsys=False, highlights=False):
        pdf_url = paper_url.replace("abs", "pdf") + ".pdf"
        if highlights:
            url = highlights_endpoint
        else:
            url = extract_endpoint
        if synopsys:
            url = summarize_endpoint
        # Copy the shared module-level params so concurrent calls cannot
        # overwrite each other's "url" value.
        request_params = {**params, "url": pdf_url}
        async with ClientSession() as session:
            async with session.get(url, params=request_params) as response:
                if response.ok:
                    data = await response.json()
                    if data.get("response"):
                        return data["response"]
                    return data
                # Error path: prefer the API's own message, fall back to the
                # exception text if the body is not JSON.
                try:
                    return {"code_error": response.status,
                            "message": (await response.json()).get("message")}
                except Exception as e:
                    return {"code_error": response.status,
                            "message": str(e)}
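
    # Three cases follow: a cached paper that already has highlights is served
    # straight from the db; a cached paper without highlights is re-fetched and
    # the record updated; an unknown paper is fetched and inserted.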
    if paper := await check_paper(paper_id):
        if paper.highlights:
            data = {
                "id": paper.id_,
                "title": paper.title,
                "abstract": paper.abstract,
                "highlights": paper.highlights,
                "findings": paper.findings,
                "summary": paper.summary,
                "figures_url": paper.figures_url,
                "full_data": paper.full_data,
                # "authors": paper.authors,
            }
        else:
            data = await fetch_summary(paper_url, synopsys, highlights)
            if not data.get("code_error"):
                await add_or_update_paper(paper_id, data)
            data["id"] = paper.id_
            data["title"] = paper.title
            data["abstract"] = paper.abstract
            # data["authors"] = paper.authors
    else:
        data = await fetch_summary(paper_url, synopsys, highlights)
        if data.get("metadata"):
            # "author" arrives as a single comma-separated string; split it
            # into a list of trimmed names.
            data["authors"] = [name.strip()
                               for name in data["metadata"].get("author", "").split(",")]
            data["title"] = data["metadata"].get("title")
            data["abstract"] = data["metadata"].get("abstract")
            await add_authors_and_paper(paper_id, data)
    # await asyncio.sleep(1)
    # Signal the waiting dialog that the payload is ready.
    cross_data.intent_queue[context_id] = "done"
    cross_data.context[context_id].dialog_data.update(data)
    return


async def get_paper_desc(id_paper: str) -> dict | None:
    """Return basic metadata for an arXiv paper: from the db when cached,
    otherwise scraped from the arXiv abstract page."""
    if paper_ := await check_paper(id_paper):
        return {
            "id_": paper_.id_,
            "url": f"https://arxiv.org/abs/{paper_.id_}",
            "title": paper_.title,
            "abstract": paper_.abstract,
            "authors": None,
        }
    async with ClientSession() as session:
        async with session.get(f"https://arxiv.org/abs/{id_paper}") as response:
            if response.ok:
                # The abstract page is HTML, so use the HTML parser.
                soup = BeautifulSoup(await response.text(), features="html.parser")
                try:
                    url = soup.find("meta", property="og:url").get("content")  # type: ignore
                    paper = {
                        "id_": re.findall(r"arxiv\.org/(?:abs|pdf)/(\d{4}\.\d{5}v?\d?)", url)[0],
                        "url": url,
                        "title": soup.find("meta", property="og:title").get("content"),  # type: ignore
                        "abstract": soup.find("meta", property="og:description").get("content").replace("\n", " "),  # type: ignore
                        "authors": [name.text for name in soup.find("div", class_="authors").find_all("a")],  # type: ignore
                    }
                    await add_authors_and_paper(paper["id_"], paper)
                    return paper
                except (TypeError, AttributeError, IndexError):
                    # A missing tag or an unexpected URL format means the page
                    # layout changed or the id is invalid.
                    pass
    return None
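

if __name__ == "__main__":
    # Minimal smoke test, a sketch only: it assumes the db module is configured
    # and reachable, and the arXiv id below is purely illustrative.
    async def _demo() -> None:
        paper = await get_paper_desc("2106.01345")
        print(paper["title"] if paper else "paper not found")

    asyncio.run(_demo())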