470 lines
18 KiB
Python
Executable File
470 lines
18 KiB
Python
Executable File
import base64
|
||
import os
|
||
import re
|
||
import mysql.connector
|
||
import hashlib
|
||
from typing import List, Tuple, Dict, Optional, Any
|
||
from readmdict import MDX, MDD
|
||
from bs4 import BeautifulSoup, Tag
|
||
import json
|
||
from mysql.connector import Error
|
||
|
||
from backend.app.admin.schema.dict import Example, Frequency, Pronunciation, FamilyItem, WordFamily, \
|
||
WordMetaData, Sense, Definition, Topic, CrossReference, DictEntry, Etymology, EtymologyItem
|
||
|
||
|
||
class DictionaryParser:
|
||
def __init__(self, db_config: Dict):
    """Initialize the parser and open the database connection.

    Args:
        db_config: keyword arguments forwarded to mysql.connector.connect().
    """
    self.db_config = db_config
    # Connection handle; populated by connect_db() below.
    self.conn = None
    self.connect_db()
|
||
|
||
def connect_db(self):
    """Connect to the MySQL database using self.db_config.

    Raises:
        mysql.connector.Error: re-raised after logging when the connection fails.
    """
    try:
        self.conn = mysql.connector.connect(**self.db_config)
    except Error as e:
        print(f"数据库连接失败: {e}")
        raise
|
||
|
||
def parse_mdx_mdd(self, mdx_path: str, mdd_path: str = None) -> None:
    """Parse an MDX dictionary file and (optionally) its companion MDD archive.

    Entries from the MDX are persisted via save_entries(); media referenced
    by those entries is then resolved against the MDD archive and stored.

    Args:
        mdx_path: path to the .mdx dictionary file.
        mdd_path: optional path to the .mdd media archive.

    Raises:
        Exception: re-raised after logging if parsing or persistence fails.
    """
    try:
        # Parse the MDX file into (word, definition) pairs + media references.
        entries, media_references = self.parse_mdx_file_mdict(mdx_path)

        # Persist the entries and remember their database ids.
        entry_ids = self.save_entries(entries)

        # Resolve media only when an MDD archive is actually available.
        if mdd_path and os.path.exists(mdd_path):
            self.parse_mdd_file(mdd_path, media_references, entry_ids)
        else:
            print("未提供MDD文件或文件不存在")

        print(f"解析完成,共处理 {len(entries)} 个词汇条目")

    # FIX: the original caught mysql.connector.Error only, so non-database
    # failures (I/O errors, MDX format errors) bypassed the log message.
    except Exception as e:
        print(f"解析词典文件失败: {e}")
        raise
|
||
|
||
def parse_mdx_file_mdict(self, mdx_path: str) -> Tuple[List[Tuple[str, str]], List[Dict]]:
    """Parse an MDX file with readmdict, collecting entries and media references.

    Args:
        mdx_path: path to the .mdx file.

    Returns:
        (entries, media_references) where entries is a list of
        (word, html-definition) pairs and media_references the accumulated
        output of extract_media_references() for every entry.

    Raises:
        Exception: re-raised after logging if the MDX cannot be parsed.
    """
    print(f"正在解析MDX文件: {mdx_path}")

    try:
        mdx = MDX(mdx_path)
        entries = []
        media_references = []

        for key, value in mdx.items():
            # readmdict yields bytes; normalize both sides to str.
            word = key.decode('utf-8') if isinstance(key, bytes) else str(key)
            definition = value.decode('utf-8') if isinstance(value, bytes) else str(value)

            if word and definition:
                entries.append((word, definition))
                # Collect audio/image references embedded in the definition.
                media_references.extend(self.extract_media_references(definition, word))

        return entries, media_references

    # FIX: the original caught mysql.connector.Error only, but MDX parsing
    # raises unrelated exception types, which skipped the log message.
    except Exception as e:
        print(f"解析MDX文件失败: {e}")
        raise
|
||
|
||
def parse_mdd_file(self, mdd_path: str, media_references: List[Dict], entry_ids: Dict[str, int]) -> None:
    """Parse the MDD media archive and store the referenced files.

    Builds a normalized filename -> bytes map from the archive, then hands
    it to save_dict_media() together with the references collected from
    the MDX definitions.

    Args:
        mdd_path: path to the .mdd archive.
        media_references: references from extract_media_references().
        entry_ids: word -> dict_entry.id mapping from save_entries().

    Raises:
        Exception: re-raised after logging if the archive cannot be parsed.
    """
    print(f"正在解析MDD文件: {mdd_path}")

    try:
        mdd = MDD(mdd_path)

        # Map normalized filenames to raw media bytes.
        dict_media = {}
        for key, value in mdd.items():
            filename = key.decode('utf-8') if isinstance(key, bytes) else str(key)
            # Normalize separators so lookups from HTML references match.
            filename = filename.replace('\\', '/').lstrip('/')
            dict_media[filename] = value

        # Persist the media blobs and link them to their entries.
        self.save_dict_media(dict_media, media_references, entry_ids)

    # FIX: the original caught mysql.connector.Error only, so MDD parsing
    # failures (which raise other exception types) skipped the log message.
    except Exception as e:
        print(f"解析MDD文件失败: {e}")
        raise
|
||
|
||
def extract_media_references(self, definition: str, word: str) -> List[Dict]:
|
||
"""从定义中提取媒体文件引用"""
|
||
media_refs = []
|
||
|
||
# 提取音频文件引用 - 更通用的模式,匹配 sound:// 或 href="sound://..."
|
||
# 这个模式应该能覆盖 aeroplane.txt 中的 sound://media/english/... 链接
|
||
audio_patterns = [
|
||
r'sound://([^"\s>]+\.mp3)', # 直接 sound:// 开头,后跟非空格/"/>字符直到 .mp3
|
||
r'href\s*=\s*["\']sound://([^"\'>]+\.mp3)["\']', # href="sound://..."
|
||
r'href\s*=\s*["\']sound://([^"\'>]+)["\']', # 更宽松的 href="sound://...",不一定以.mp3结尾
|
||
r'data-src-mp3\s*=\s*["\']sound://([^"\'>]+\.mp3)["\']', # data-src-mp3="sound://..."
|
||
r'data-src-mp3\s*=\s*["\']([^"\'>]+\.mp3)["\']', # data-src-mp3="..." (相对路径)
|
||
r'audio\s*=\s*["\']([^"\']+)["\']', # audio="..."
|
||
]
|
||
|
||
for pattern in audio_patterns:
|
||
matches = re.findall(pattern, definition, re.IGNORECASE)
|
||
for match in matches:
|
||
# 清理可能的多余字符(如结尾的引号或空格,虽然正则应该已经避免了)
|
||
clean_filename = match.strip()#.rstrip('"\'')
|
||
if clean_filename:
|
||
media_refs.append({
|
||
'filename': clean_filename,
|
||
'type': 'audio',
|
||
'word': word
|
||
})
|
||
|
||
# 提取图片文件引用
|
||
image_patterns = [
|
||
r'<img[^>]*src\s*=\s*["\']([^"\']+\.(?:jpg|jpeg|png|gif|bmp))["\']', # src="..."
|
||
r'\[image:([^\]]+\.(?:jpg|jpeg|png|gif|bmp))\]', # [image:...]
|
||
r'src\s*=\s*["\']([^"\']+\.(?:jpg|jpeg|png|gif|bmp))["\']' # 更宽松的 src="..."
|
||
]
|
||
|
||
for pattern in image_patterns:
|
||
matches = re.findall(pattern, definition, re.IGNORECASE)
|
||
for match in matches:
|
||
# 清理可能的多余字符
|
||
clean_filename = match.strip()#.rstrip('"\'')
|
||
if clean_filename:
|
||
media_refs.append({
|
||
'filename': clean_filename,
|
||
'type': 'image',
|
||
'word': word
|
||
})
|
||
|
||
return media_refs
|
||
|
||
def save_entries(self, entries: List[Tuple[str, str]]) -> Dict[str, int]:
    """Insert or update dictionary entries, maintaining the JSON `details` column.

    For each (word, definition) pair:
      * a brand-new word is inserted with a default WordMetaData payload;
      * an existing word whose incoming definition is an '@@@' reference
        list keeps its stored definition and has the links merged
        (deduplicated, order-preserving) into details.ref_link;
      * any other existing word is left untouched.

    Args:
        entries: (word, definition) pairs from the MDX file.

    Returns:
        Mapping of word -> dict_entry.id for every processed word.
    """
    # NOTE: the original re-imported Error and hashlib locally; both are
    # redundant (Error is imported at module level, hashlib was unused).
    cursor = self.conn.cursor(dictionary=True)
    entry_ids = {}

    for word, definition in entries:
        try:
            # Look up any existing row for this headword.
            cursor.execute('SELECT id, definition, details FROM dict_entry WHERE word = %s', (word,))
            existing_record = cursor.fetchone()

            metadata = None
            existing_details = None
            final_definition = definition  # default: use the incoming definition

            if existing_record:
                entry_id = existing_record['id']
                existing_definition = existing_record['definition']
                existing_details_json = existing_record['details']

                # Rehydrate stored details. FIX: depending on connector
                # configuration the JSON column may come back as a raw str,
                # which would crash WordMetaData(**...); decode it first.
                # FIX: was a bare `except:`.
                if existing_details_json:
                    try:
                        if isinstance(existing_details_json, str):
                            existing_details_json = json.loads(existing_details_json)
                        existing_details = WordMetaData(**existing_details_json)
                    except Exception:
                        existing_details = None

                # An '@@@' definition is a cross-reference list, not content:
                # keep the stored definition and merge the links into details.
                if definition.startswith('@@@'):
                    final_definition = existing_definition

                    # Reference lines only appear at the top of the payload.
                    new_ref_links = []
                    for line in definition.split('\n'):
                        if line.startswith('@@@'):
                            link = line[3:].strip()
                            if link:
                                new_ref_links.append(link)
                        else:
                            break

                    if new_ref_links:
                        if existing_details and existing_details.ref_link:
                            # Merge, de-duplicating while preserving order.
                            combined_links = existing_details.ref_link[:]
                            for link in new_ref_links:
                                if link not in combined_links:
                                    combined_links.append(link)
                        else:
                            combined_links = new_ref_links

                        if existing_details:
                            metadata = existing_details.model_copy(update={"ref_link": combined_links})
                        else:
                            metadata = WordMetaData(ref_link=combined_links)

            # Write only for new words or when details need updating.
            if not existing_record or metadata:
                if not existing_record:
                    metadata = WordMetaData()  # default payload for new words

                details_dict = metadata.model_dump() if metadata else None
                details_json = json.dumps(details_dict) if details_dict else None

                if existing_record:
                    cursor.execute('''
                        UPDATE dict_entry
                        SET definition = %s, details = %s
                        WHERE word = %s
                    ''', (final_definition, details_json, word))
                    entry_id = existing_record['id']
                else:
                    cursor.execute('''
                        INSERT INTO dict_entry (word, definition, details)
                        VALUES (%s, %s, %s)
                    ''', (word, final_definition, details_json))
                    entry_id = cursor.lastrowid
            else:
                # Existing row with nothing to merge: leave it untouched.
                entry_id = existing_record['id']

            entry_ids[word] = entry_id

        except Error as e:
            print(f"保存词条 '{word}' 时出错: {e}")

    self.conn.commit()
    cursor.close()
    return entry_ids
|
||
|
||
def parse_definition_to_metadata(self, html_str: str) -> Tuple[Dict, List[Dict]]:
    """Parse an HTML definition string into structured metadata.

    Args:
        html_str: raw HTML of a dictionary definition.

    Returns:
        (metadata, media_info): metadata holds pronunciations, parts of
        speech, definitions and examples as plain dicts/lists; media_info
        lists every <img> source found in the markup.
    """
    soup = BeautifulSoup(html_str, 'html.parser')

    # Pronunciation links carry the variant in data-rel and the audio URL in href.
    pronunciations = [
        Pronunciation(type=anchor.get('data-rel', ''), url=anchor.get('href', ''))
        for anchor in soup.find_all('a', class_='pronounce')
        if anchor.get('data-rel', '') and anchor.get('href', '')
    ]

    # Parts of speech.
    pos_list = [node.get_text().strip() for node in soup.find_all('span', class_='pos')]

    # Sense definitions (non-empty only).
    definitions = [
        Definition(text=text)
        for text in (node.get_text().strip() for node in soup.find_all('span', class_='def'))
        if text
    ]

    # Example sentences (non-empty only).
    examples = [
        Example(text=text)
        for text in (node.get_text().strip() for node in soup.find_all('span', class_='example'))
        if text
    ]

    metadata = {
        "pronunciations": [p.model_dump() for p in pronunciations],
        "parts_of_speech": pos_list,
        "definitions": [d.model_dump() for d in definitions],
        "examples": [e.model_dump() for e in examples],
    }

    # Embedded images.
    media_info = [
        {'type': 'image', 'src': img.get('src', '')}
        for img in soup.find_all('img')
        if img.get('src', '')
    ]

    return metadata, media_info
|
||
|
||
def save_dict_media(self, dict_media: Dict[str, bytes], media_references: List[Dict], entry_ids: Dict[str, int]) -> None:
    """Persist MDD media blobs and link them to their dictionary entries.

    Media files are de-duplicated by MD5 content hash: an already-stored
    blob is only re-linked to the entry; a new blob is inserted first.

    Args:
        dict_media: normalized filename -> raw file bytes (from the MDD).
        media_references: dicts with 'word', 'filename' and 'type' keys,
            as produced by extract_media_references().
        entry_ids: word -> dict_entry.id mapping from save_entries().
    """
    # NOTE: the original re-imported Error and hashlib locally; both are
    # already imported at module level.
    cursor = self.conn.cursor(dictionary=True)

    try:
        for media_ref in media_references:
            word = media_ref['word']
            filename = media_ref['filename']
            file_type = media_ref['type']

            # Skip references whose entry was never saved.
            entry_id = entry_ids.get(word)
            if not entry_id:
                continue

            # Normalize the reference the same way the MDD keys were
            # normalized in parse_mdd_file().
            normalized_filename = filename.replace('\\', '/').lstrip('/')
            file_data = dict_media.get(normalized_filename)
            if not file_data:
                # Fall back to a lightly-normalized variant of the name.
                file_data = dict_media.get(filename.lstrip('/'))
            if not file_data:
                # FIX: the original printed the literal "(unknown)" instead
                # of the missing filename.
                print(f"警告: 找不到媒体文件 {filename} 的数据")
                continue

            # De-duplicate stored blobs by content hash.
            file_hash = hashlib.md5(file_data).hexdigest()
            cursor.execute('''
                SELECT id FROM dict_media
                WHERE file_hash = %s AND file_type = %s
            ''', (file_hash, file_type))
            existing_record = cursor.fetchone()

            if existing_record:
                # Blob already stored: reuse its id.
                media_id = existing_record['id']
            else:
                # New blob: insert it first.
                cursor.execute('''
                    INSERT INTO dict_media (filename, file_type, file_data, file_hash)
                    VALUES (%s, %s, %s, %s)
                ''', (filename, file_type, file_data, file_hash))
                media_id = cursor.lastrowid

            # Link the media row to the entry (idempotent via INSERT IGNORE;
            # hoisted out of the two branches above, which duplicated it).
            cursor.execute('''
                INSERT IGNORE INTO dict_entry_media (entry_id, media_id)
                VALUES (%s, %s)
            ''', (entry_id, media_id))

            # Record basic details for images (currently just the byte size).
            details = {}
            if file_type == 'image':
                details['size'] = len(file_data)
            if details:
                cursor.execute('''
                    UPDATE dict_media
                    SET details = %s
                    WHERE id = %s
                ''', (json.dumps(details), media_id))

    except Error as e:
        print(f"保存媒体文件时出错: {e}")
    finally:
        # FIX: the original leaked the cursor when a non-mysql exception
        # escaped the loop.
        cursor.close()

    self.conn.commit()
|
||
|
||
def export_media_files(self, export_dir: str) -> None:
    """Export every stored media blob to *export_dir* as individual files.

    Filenames are sanitized via _sanitize_filename(); the target directory
    is created if missing.

    Args:
        export_dir: directory to write the media files into.
    """
    cursor = self.conn.cursor(dictionary=True)

    try:
        os.makedirs(export_dir, exist_ok=True)

        # Fetch all stored media blobs.
        cursor.execute('SELECT id, filename, file_type, file_data FROM dict_media')
        for record in cursor.fetchall():
            file_data = record['file_data']
            if not file_data:
                continue

            # Sanitize to strip path separators / reserved characters.
            safe_filename = self._sanitize_filename(record['filename'])
            file_path = os.path.join(export_dir, safe_filename)

            with open(file_path, 'wb') as f:
                f.write(file_data)

            print(f"导出媒体文件: {file_path}")

    except Error as e:
        print(f"导出媒体文件时出错: {e}")
    finally:
        # FIX: the original leaked the cursor when a non-mysql error
        # (e.g. an OSError from the file write) escaped the try block.
        cursor.close()
|
||
|
||
def _sanitize_filename(self, filename: str) -> str:
|
||
"""清理文件名,确保安全"""
|
||
# 移除或替换不安全的字符
|
||
unsafe_chars = '<>:"/\\|?*'
|
||
for char in unsafe_chars:
|
||
filename = filename.replace(char, '_')
|
||
return filename
|
||
|
||
def close(self):
    """Close the database connection if one was opened."""
    if self.conn:
        self.conn.close()
|
||
|
||
|
||
# 使用示例
|
||
def main():
    """Example driver: parse the LDOCE5 MDX/MDD pair into the database."""
    # Database connection settings.
    db_config = {
        'host': 'localhost',
        'database': 'mysql',
        'user': 'root',
        'password': 'root',
        'port': 3306,
    }

    # Dictionary source files (the MDD media archive is optional).
    mdx_path = './LDOCE5.mdx'
    mdd_path = './LDOCE5.mdd'

    parser = DictionaryParser(db_config)

    try:
        # Parse the dictionary files into the database.
        parser.parse_mdx_mdd(mdx_path, mdd_path)

        # Optionally dump stored media to a local directory:
        # parser.export_media_files('./exported_media')
    except Error as e:
        print(f"解析过程中出现错误: {e}")
    finally:
        parser.close()
|
||
|
||
|
||
# Run the example pipeline only when executed as a script.
if __name__ == "__main__":
    main()
|