file.py 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147
  1. import datetime
  2. import hashlib
  3. import tempfile
  4. import time
  5. import uuid
  6. from pathlib import Path
  7. from cachetools import TTLCache
  8. from flask import request, current_app
  9. from flask_login import login_required, current_user
  10. from flask_restful import Resource, marshal_with, fields
  11. from werkzeug.exceptions import NotFound
  12. from controllers.console import api
  13. from controllers.console.datasets.error import NoFileUploadedError, TooManyFilesError, FileTooLargeError, \
  14. UnsupportedFileTypeError
  15. from controllers.console.setup import setup_required
  16. from controllers.console.wraps import account_initialization_required
  17. from core.index.readers.html_parser import HTMLParser
  18. from core.index.readers.pdf_parser import PDFParser
  19. from extensions.ext_storage import storage
  20. from libs.helper import TimestampField
  21. from extensions.ext_database import db
  22. from models.model import UploadFile
  23. cache = TTLCache(maxsize=None, ttl=30)
  24. FILE_SIZE_LIMIT = 15 * 1024 * 1024 # 15MB
  25. ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm']
  26. PREVIEW_WORDS_LIMIT = 3000
  27. class FileApi(Resource):
  28. file_fields = {
  29. 'id': fields.String,
  30. 'name': fields.String,
  31. 'size': fields.Integer,
  32. 'extension': fields.String,
  33. 'mime_type': fields.String,
  34. 'created_by': fields.String,
  35. 'created_at': TimestampField,
  36. }
  37. @setup_required
  38. @login_required
  39. @account_initialization_required
  40. @marshal_with(file_fields)
  41. def post(self):
  42. # get file from request
  43. file = request.files['file']
  44. # check file
  45. if 'file' not in request.files:
  46. raise NoFileUploadedError()
  47. if len(request.files) > 1:
  48. raise TooManyFilesError()
  49. file_content = file.read()
  50. file_size = len(file_content)
  51. if file_size > FILE_SIZE_LIMIT:
  52. message = "({file_size} > {FILE_SIZE_LIMIT})"
  53. raise FileTooLargeError(message)
  54. extension = file.filename.split('.')[-1]
  55. if extension not in ALLOWED_EXTENSIONS:
  56. raise UnsupportedFileTypeError()
  57. # user uuid as file name
  58. file_uuid = str(uuid.uuid4())
  59. file_key = 'upload_files/' + current_user.current_tenant_id + '/' + file_uuid + '.' + extension
  60. # save file to storage
  61. storage.save(file_key, file_content)
  62. # save file to db
  63. config = current_app.config
  64. upload_file = UploadFile(
  65. tenant_id=current_user.current_tenant_id,
  66. storage_type=config['STORAGE_TYPE'],
  67. key=file_key,
  68. name=file.filename,
  69. size=file_size,
  70. extension=extension,
  71. mime_type=file.mimetype,
  72. created_by=current_user.id,
  73. created_at=datetime.datetime.utcnow(),
  74. used=False,
  75. hash=hashlib.sha3_256(file_content).hexdigest()
  76. )
  77. db.session.add(upload_file)
  78. db.session.commit()
  79. return upload_file, 201
  80. class FilePreviewApi(Resource):
  81. @setup_required
  82. @login_required
  83. @account_initialization_required
  84. def get(self, file_id):
  85. file_id = str(file_id)
  86. key = file_id + request.path
  87. cached_response = cache.get(key)
  88. if cached_response and time.time() - cached_response['timestamp'] < cache.ttl:
  89. return cached_response['response']
  90. upload_file = db.session.query(UploadFile) \
  91. .filter(UploadFile.id == file_id) \
  92. .first()
  93. if not upload_file:
  94. raise NotFound("File not found")
  95. # extract text from file
  96. extension = upload_file.extension
  97. if extension not in ALLOWED_EXTENSIONS:
  98. raise UnsupportedFileTypeError()
  99. with tempfile.TemporaryDirectory() as temp_dir:
  100. suffix = Path(upload_file.key).suffix
  101. filepath = f"{temp_dir}/{next(tempfile._get_candidate_names())}{suffix}"
  102. storage.download(upload_file.key, filepath)
  103. if extension == 'pdf':
  104. parser = PDFParser({'upload_file': upload_file})
  105. text = parser.parse_file(Path(filepath))
  106. elif extension in ['html', 'htm']:
  107. # Use BeautifulSoup to extract text
  108. parser = HTMLParser()
  109. text = parser.parse_file(Path(filepath))
  110. else:
  111. # ['txt', 'markdown', 'md']
  112. with open(filepath, "rb") as fp:
  113. data = fp.read()
  114. text = data.decode(encoding='utf-8').strip() if data else ''
  115. text = text[0:PREVIEW_WORDS_LIMIT] if text else ''
  116. return {'content': text}
  117. api.add_resource(FileApi, '/files/upload')
  118. api.add_resource(FilePreviewApi, '/files/<uuid:file_id>/preview')