- # Talk to Ollama
- print("Thinking... (Sending to local Ollama)")
- response = requests.post('http://localhost:11434/api/chat', json={
- "model": "llama3",
- "messages": chat_history,
- "stream": False
- }, timeout=45)
-
- response.raise_for_status() # Triggers an error if Ollama is broken
- reply_text = response.json()['message']['content']
-
- print(f"Ollama Replied: {reply_text}")
- chat_history.append({"role": "assistant", "content": reply_text})
+ # 2. First message seen for this session_id: seed its history with a single "system" turn holding NPC_SYSTEM_PROMPT
+ if session_id not in chat_memory:
+ chat_memory[session_id] = [{"role": "system", "content": NPC_SYSTEM_PROMPT}]
+
+ # 3. Log the incoming line, then record it in this session's history as a "user" turn prefixed with the speaker's name
+ print(f"\n[RECEIVED] {player_name} -> {npc_tag}: '{message}'")
+ chat_memory[session_id].append({"role": "user", "content": f"{player_name} says: {message}"})
+
+ # 4. Enter the shared semaphore before calling the model; callers beyond its limit wait here (limit is set where `semaphore` is created, not shown — per the original note, presumably sized to what the GPU can serve at once)
+ async with semaphore:
+ print(f"[THINKING] Processing reply for {player_name}...")
+
+ # POST this session's whole history to the local Ollama chat endpoint via the shared `session` (presumably an aiohttp ClientSession — confirm); the body is read below as one JSON document
+ async with session.post('http://localhost:11434/api/chat', json={
+ "model": "llama3",
+ "messages": chat_memory[session_id],
+ "stream": False
+ }, timeout=45) as response:  # NOTE(review): bare number as timeout — presumably total seconds; aiohttp documents a ClientTimeout object for this parameter, confirm
+
+ response.raise_for_status()  # NOTE(review): if this (or the request itself) raises, the "user" turn appended above stays in history with no assistant reply — confirm intended
+ result = await response.json()
+ reply_text = result['message']['content']  # a JSON object missing either key raises KeyError here
+
+ # 5. Log the model's reply, then append it to this session's history as the "assistant" turn
+ print(f"[REPLY] Elrendur to {player_name}: {reply_text}")  # NOTE(review): NPC name is hard-coded here while the [RECEIVED] log uses npc_tag — confirm intended
+ chat_memory[session_id].append({"role": "assistant", "content": reply_text})