file.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156
  1. import datetime
  2. import hashlib
  3. import tempfile
  4. import chardet
  5. import time
  6. import uuid
  7. from pathlib import Path
  8. from cachetools import TTLCache
  9. from flask import request, current_app
  10. from flask_login import login_required, current_user
  11. from flask_restful import Resource, marshal_with, fields
  12. from werkzeug.exceptions import NotFound
  13. from controllers.console import api
  14. from controllers.console.datasets.error import NoFileUploadedError, TooManyFilesError, FileTooLargeError, \
  15. UnsupportedFileTypeError
  16. from controllers.console.setup import setup_required
  17. from controllers.console.wraps import account_initialization_required
  18. from core.index.readers.html_parser import HTMLParser
  19. from core.index.readers.pdf_parser import PDFParser
  20. from core.index.readers.xlsx_parser import XLSXParser
  21. from extensions.ext_storage import storage
  22. from libs.helper import TimestampField
  23. from extensions.ext_database import db
  24. from models.model import UploadFile
  25. cache = TTLCache(maxsize=None, ttl=30)
  26. FILE_SIZE_LIMIT = 15 * 1024 * 1024 # 15MB
  27. ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx']
  28. PREVIEW_WORDS_LIMIT = 3000
  29. class FileApi(Resource):
  30. file_fields = {
  31. 'id': fields.String,
  32. 'name': fields.String,
  33. 'size': fields.Integer,
  34. 'extension': fields.String,
  35. 'mime_type': fields.String,
  36. 'created_by': fields.String,
  37. 'created_at': TimestampField,
  38. }
  39. @setup_required
  40. @login_required
  41. @account_initialization_required
  42. @marshal_with(file_fields)
  43. def post(self):
  44. # get file from request
  45. file = request.files['file']
  46. # check file
  47. if 'file' not in request.files:
  48. raise NoFileUploadedError()
  49. if len(request.files) > 1:
  50. raise TooManyFilesError()
  51. file_content = file.read()
  52. file_size = len(file_content)
  53. if file_size > FILE_SIZE_LIMIT:
  54. message = "({file_size} > {FILE_SIZE_LIMIT})"
  55. raise FileTooLargeError(message)
  56. extension = file.filename.split('.')[-1]
  57. if extension not in ALLOWED_EXTENSIONS:
  58. raise UnsupportedFileTypeError()
  59. # user uuid as file name
  60. file_uuid = str(uuid.uuid4())
  61. file_key = 'upload_files/' + current_user.current_tenant_id + '/' + file_uuid + '.' + extension
  62. # save file to storage
  63. storage.save(file_key, file_content)
  64. # save file to db
  65. config = current_app.config
  66. upload_file = UploadFile(
  67. tenant_id=current_user.current_tenant_id,
  68. storage_type=config['STORAGE_TYPE'],
  69. key=file_key,
  70. name=file.filename,
  71. size=file_size,
  72. extension=extension,
  73. mime_type=file.mimetype,
  74. created_by=current_user.id,
  75. created_at=datetime.datetime.utcnow(),
  76. used=False,
  77. hash=hashlib.sha3_256(file_content).hexdigest()
  78. )
  79. db.session.add(upload_file)
  80. db.session.commit()
  81. return upload_file, 201
  82. class FilePreviewApi(Resource):
  83. @setup_required
  84. @login_required
  85. @account_initialization_required
  86. def get(self, file_id):
  87. file_id = str(file_id)
  88. key = file_id + request.path
  89. cached_response = cache.get(key)
  90. if cached_response and time.time() - cached_response['timestamp'] < cache.ttl:
  91. return cached_response['response']
  92. upload_file = db.session.query(UploadFile) \
  93. .filter(UploadFile.id == file_id) \
  94. .first()
  95. if not upload_file:
  96. raise NotFound("File not found")
  97. # extract text from file
  98. extension = upload_file.extension
  99. if extension not in ALLOWED_EXTENSIONS:
  100. raise UnsupportedFileTypeError()
  101. with tempfile.TemporaryDirectory() as temp_dir:
  102. suffix = Path(upload_file.key).suffix
  103. filepath = f"{temp_dir}/{next(tempfile._get_candidate_names())}{suffix}"
  104. storage.download(upload_file.key, filepath)
  105. if extension == 'pdf':
  106. parser = PDFParser({'upload_file': upload_file})
  107. text = parser.parse_file(Path(filepath))
  108. elif extension in ['html', 'htm']:
  109. # Use BeautifulSoup to extract text
  110. parser = HTMLParser()
  111. text = parser.parse_file(Path(filepath))
  112. elif extension == 'xlsx':
  113. parser = XLSXParser()
  114. text = parser.parse_file(filepath)
  115. else:
  116. # ['txt', 'markdown', 'md']
  117. with open(filepath, "rb") as fp:
  118. data = fp.read()
  119. encoding = chardet.detect(data)['encoding']
  120. if encoding:
  121. text = data.decode(encoding=encoding).strip() if data else ''
  122. else:
  123. text = data.decode(encoding='utf-8').strip() if data else ''
  124. text = text[0:PREVIEW_WORDS_LIMIT] if text else ''
  125. return {'content': text}
  126. api.add_resource(FileApi, '/files/upload')
  127. api.add_resource(FilePreviewApi, '/files/<uuid:file_id>/preview')