|
| 1 | +# server.py |
| 2 | + |
| 3 | +from flask import Flask, request, send_file, jsonify |
| 4 | +from gevent.pywsgi import WSGIServer |
| 5 | +from dotenv import load_dotenv |
| 6 | +import os |
| 7 | + |
| 8 | +from handle_text import prepare_tts_input_with_context |
| 9 | +from tts_handler import generate_speech, get_models, get_voices |
| 10 | +from utils import getenv_bool, require_api_key, AUDIO_FORMAT_MIME_TYPES |
| 11 | + |
app = Flask(__name__)
# Load variables from a .env file into os.environ before any os.getenv below.
load_dotenv()

# API key checked by the @require_api_key decorator (see utils).
API_KEY = os.getenv('API_KEY', 'your_api_key_here')
# TCP port the WSGI server binds to in the __main__ block.
PORT = int(os.getenv('PORT', 5050))

# Fallbacks used when a request omits voice / response_format / speed.
DEFAULT_VOICE = os.getenv('DEFAULT_VOICE', 'en-US-AvaNeural')
DEFAULT_RESPONSE_FORMAT = os.getenv('DEFAULT_RESPONSE_FORMAT', 'mp3')
DEFAULT_SPEED = float(os.getenv('DEFAULT_SPEED', 1.0))

# When True, skip the prepare_tts_input_with_context() text preprocessing.
REMOVE_FILTER = getenv_bool('REMOVE_FILTER', False)
# When False, the /elevenlabs and /azure compatibility endpoints are refused.
EXPAND_API = getenv_bool('EXPAND_API', True)

# DEFAULT_MODEL = os.getenv('DEFAULT_MODEL', 'tts-1')
| 26 | + |
@app.route('/v1/audio/speech', methods=['POST'])
@app.route('/audio/speech', methods=['POST'])  # Add this line for the alias
@require_api_key
def text_to_speech():
    """OpenAI-compatible TTS endpoint: turn JSON 'input' text into an audio file.

    Expects a JSON body with 'input' (required) and optional 'voice',
    'response_format' and 'speed'. Returns the generated audio as an
    attachment, or a JSON error with status 400/500.
    """
    # silent=True returns None for a missing or non-JSON body instead of
    # raising 415/400 inside Flask, so we can emit our own clean 400.
    data = request.get_json(silent=True)
    if not data or 'input' not in data:
        return jsonify({"error": "Missing 'input' in request body"}), 400

    text = data.get('input')

    if not REMOVE_FILTER:
        text = prepare_tts_input_with_context(text)

    # model = data.get('model', DEFAULT_MODEL)
    voice = data.get('voice', DEFAULT_VOICE)

    response_format = data.get('response_format', DEFAULT_RESPONSE_FORMAT)

    # A non-numeric 'speed' would otherwise raise ValueError and become a 500.
    try:
        speed = float(data.get('speed', DEFAULT_SPEED))
    except (TypeError, ValueError):
        return jsonify({"error": "Invalid 'speed' value; must be a number"}), 400

    mime_type = AUDIO_FORMAT_MIME_TYPES.get(response_format, "audio/mpeg")

    # Generate the audio file in the specified format with speed adjustment.
    # Guarded like the /elevenlabs and /azure endpoints so failures surface
    # as a JSON error instead of an unhandled 500.
    try:
        output_file_path = generate_speech(text, voice, response_format, speed)
    except Exception as e:
        return jsonify({"error": f"TTS generation failed: {str(e)}"}), 500

    # Return the file with the correct MIME type
    return send_file(output_file_path, mimetype=mime_type, as_attachment=True, download_name=f"speech.{response_format}")
| 53 | + |
@app.route('/v1/models', methods=['GET', 'POST'])
@app.route('/models', methods=['GET', 'POST'])
@require_api_key
def list_models():
    """Return the available TTS models in an OpenAI-style {"data": [...]} envelope."""
    models = get_models()
    return jsonify({"data": models})
| 59 | + |
@app.route('/v1/voices', methods=['GET', 'POST'])
@app.route('/voices', methods=['GET', 'POST'])
@require_api_key
def list_voices():
    """List available voices, optionally filtered by 'language' or 'locale'.

    GET reads the filter from the query string; POST reads it from a JSON
    body. With no filter, all voices for the default selection are returned.
    """
    specific_language = None

    # silent=True: a POST without a (valid) JSON body yields None instead of
    # raising, so the endpoint degrades gracefully to the unfiltered listing.
    data = request.args if request.method == 'GET' else request.get_json(silent=True)
    if data and ('language' in data or 'locale' in data):
        # 'language' wins over 'locale' when both are supplied.
        specific_language = data.get('language') if 'language' in data else data.get('locale')

    return jsonify({"voices": get_voices(specific_language)})
| 71 | + |
@app.route('/v1/voices/all', methods=['GET', 'POST'])
@app.route('/voices/all', methods=['GET', 'POST'])
@require_api_key
def list_all_voices():
    """List every supported voice, bypassing any language/locale filtering."""
    all_voices = get_voices('all')
    return jsonify({"voices": all_voices})
| 77 | + |
| 78 | +""" |
| 79 | +Support for ElevenLabs and Azure AI Speech |
| 80 | + (currently in beta) |
| 81 | +""" |
| 82 | + |
# http://localhost:5050/elevenlabs/v1/text-to-speech
# http://localhost:5050/elevenlabs/v1/text-to-speech/en-US-AndrewNeural
@app.route('/elevenlabs/v1/text-to-speech/<voice_id>', methods=['POST'])
@require_api_key
def elevenlabs_tts(voice_id):
    """ElevenLabs-compatible TTS endpoint (beta).

    The voice comes from the URL path; the JSON body must contain 'text'.
    Output is always mp3 at the server's default speed.
    """
    if not EXPAND_API:
        # 403: the endpoint is deliberately disabled via EXPAND_API, which is
        # a policy refusal, not a server error (was incorrectly 500).
        return jsonify({"error": "Endpoint not allowed"}), 403

    # Parse the incoming JSON payload
    try:
        payload = request.json
        if not payload or 'text' not in payload:
            return jsonify({"error": "Missing 'text' in request body"}), 400
    except Exception as e:
        return jsonify({"error": f"Invalid JSON payload: {str(e)}"}), 400

    text = payload['text']

    if not REMOVE_FILTER:
        text = prepare_tts_input_with_context(text)

    voice = voice_id  # ElevenLabs uses the voice_id in the URL

    # Use default settings for edge-tts
    response_format = 'mp3'
    speed = DEFAULT_SPEED  # Optional customization via payload.get('speed', DEFAULT_SPEED)

    # Generate speech using edge-tts
    try:
        output_file_path = generate_speech(text, voice, response_format, speed)
    except Exception as e:
        return jsonify({"error": f"TTS generation failed: {str(e)}"}), 500

    # Return the generated audio file
    return send_file(output_file_path, mimetype="audio/mpeg", as_attachment=True, download_name="speech.mp3")
| 118 | + |
# tts.speech.microsoft.com/cognitiveservices/v1
# https://{region}.tts.speech.microsoft.com/cognitiveservices/v1
# http://localhost:5050/azure/cognitiveservices/v1
@app.route('/azure/cognitiveservices/v1', methods=['POST'])
@require_api_key
def azure_tts():
    """Azure AI Speech-compatible TTS endpoint (beta).

    Accepts an SSML request body; the text and voice name are taken from
    the first <voice> element. Output is always mp3 at the default speed.
    """
    if not EXPAND_API:
        # 403: endpoint deliberately disabled via EXPAND_API (was 500).
        return jsonify({"error": "Endpoint not allowed"}), 403

    # Parse the SSML payload
    try:
        ssml_data = request.data.decode('utf-8')
        if not ssml_data:
            return jsonify({"error": "Missing SSML payload"}), 400

        # Extract the text and voice from SSML
        from xml.etree import ElementTree as ET
        root = ET.fromstring(ssml_data)
        # Look the <voice> element up once; the original queried it twice and
        # a missing element surfaced as an opaque NoneType AttributeError.
        voice_element = root.find('.//{http://www.w3.org/2001/10/synthesis}voice')
        if voice_element is None:
            return jsonify({"error": "Missing <voice> element in SSML payload"}), 400
        text = voice_element.text
        voice = voice_element.get('name')
    except Exception as e:
        return jsonify({"error": f"Invalid SSML payload: {str(e)}"}), 400

    # Use default settings for edge-tts
    response_format = 'mp3'
    speed = DEFAULT_SPEED

    if not REMOVE_FILTER:
        text = prepare_tts_input_with_context(text)

    # Generate speech using edge-tts
    try:
        output_file_path = generate_speech(text, voice, response_format, speed)
    except Exception as e:
        return jsonify({"error": f"TTS generation failed: {str(e)}"}), 500

    # Return the generated audio file
    return send_file(output_file_path, mimetype="audio/mpeg", as_attachment=True, download_name="speech.mp3")
| 157 | + |
# Startup banner printed at import time, before the server begins serving.
_banner = "\n".join((
    " Edge TTS (Free Azure TTS) Replacement for OpenAI's TTS API",
    " ",
    " * Serving OpenAI Edge TTS",
    f" * Server running on http://localhost:{PORT}",
    f" * TTS Endpoint: http://localhost:{PORT}/v1/audio/speech",
    " ",
))
print(_banner)
| 164 | + |
if __name__ == '__main__':
    # Serve with gevent's production WSGI server on all interfaces;
    # serve_forever() blocks until the process is terminated.
    http_server = WSGIServer(('0.0.0.0', PORT), app)
    http_server.serve_forever()