- # Talk to Ollama
- print("Thinking... (Sending to local Ollama)")
- response = requests.post('http://localhost:11434/api/chat', json={
- "model": "llama3",
- "messages": chat_history,
- "stream": False
- }, timeout=45)
-
- response.raise_for_status() # Triggers an error if Ollama is broken
- reply_text = response.json()['message']['content']
-
- print(f"Ollama Replied: {reply_text}")
- chat_history.append({"role": "assistant", "content": reply_text})
+ # 2. If they haven't spoken before, initialize their memory with the system prompt
+ if session_id not in chat_memory:
+ chat_memory[session_id] = [{"role": "system", "content": NPC_SYSTEM_PROMPT}]
+
+ # 3. Add the player's new message
+ chat_memory[session_id].append({"role": "user", "content": f"{player_name} says: {message}"})
+
+ # --- THE SLIDING WINDOW FIX ---
+ # If the memory gets longer than 11 messages (1 system prompt + 10 chat messages)
+ if len(chat_memory[session_id]) > 11:
+ # Keep the system prompt at index [0], and grab the 10 most recent messages
+ chat_memory[session_id] = [chat_memory[session_id][0]] + chat_memory[session_id][-10:]
+
+ # 4. The Semaphore Bouncer: Wait in line if the GPU is currently full
+ async with semaphore:
+ print(f"[THINKING] Processing reply for {player_name}...")
+
+ # Use aiohttp to make a non-blocking network request to Ollama
+ async with session.post('http://localhost:11434/api/chat', json={
+ "model": "llama3",
+ "messages": chat_memory[session_id],
+ "stream": False
+ }, timeout=45) as response:
+
+ response.raise_for_status()
+ result = await response.json()
+ reply_text = result['message']['content']