Merge branch 'main' into feat/rag-2

This commit is contained in:
twwu
2025-08-21 09:43:51 +08:00
131 changed files with 3855 additions and 1081 deletions

View File

@ -1,17 +0,0 @@
import re
def get_image_upload_file_ids(content):
pattern = r"!\[image\]\((http?://.*?(file-preview|image-preview))\)"
matches = re.findall(pattern, content)
image_upload_file_ids = []
for match in matches:
if match[1] == "file-preview":
content_pattern = r"files/([^/]+)/file-preview"
else:
content_pattern = r"files/([^/]+)/image-preview"
content_match = re.search(content_pattern, match[0])
if content_match:
image_upload_file_id = content_match.group(1)
image_upload_file_ids.append(image_upload_file_id)
return image_upload_file_ids

View File

@ -80,14 +80,14 @@ def get_url(url: str, user_agent: Optional[str] = None) -> str:
else:
content = response.text
article = extract_using_readabilipy(content)
article = extract_using_readability(content)
if not article.text:
return ""
res = FULL_TEMPLATE.format(
title=article.title,
author=article.auther,
author=article.author,
text=article.text,
)
@ -97,15 +97,15 @@ def get_url(url: str, user_agent: Optional[str] = None) -> str:
@dataclass
class Article:
title: str
auther: str
author: str
text: Sequence[dict]
def extract_using_readabilipy(html: str):
def extract_using_readability(html: str):
json_article: dict[str, Any] = simple_json_from_html_string(html, use_readability=True)
article = Article(
title=json_article.get("title") or "",
auther=json_article.get("byline") or "",
author=json_article.get("byline") or "",
text=json_article.get("plain_text") or [],
)
@ -113,7 +113,7 @@ def extract_using_readabilipy(html: str):
def get_image_upload_file_ids(content):
pattern = r"!\[image\]\((http?://.*?(file-preview|image-preview))\)"
pattern = r"!\[image\]\((https?://.*?(file-preview|image-preview))\)"
matches = re.findall(pattern, content)
image_upload_file_ids = []
for match in matches: