131 lines
4.9 KiB
Python
131 lines
4.9 KiB
Python
# Makes a request to a local Ollama server and streams the response.
|
|
import requests
|
|
import sys
|
|
import time
|
|
import json
|
|
import argparse
|
|
|
|
# Host and port of the Ollama server that every request in this script targets.
OLLAMA_SERVER = "10.48.9.106"
OLLAMA_PORT = 11434

# Instructions sent ahead of the conversation on every chat request.
SYSTEM_PROMPTS = [
    (
        "Always answer in a CLI-friendly, plain text format. "
        "Use Markdown code blocks for code, and ensure code is easy to copy and paste. "
        "Do not use colors or special formatting that breaks copy-paste. "
        "Only use plain Markdown for code."
    ),
]

# Completed turns of the current session. stream_ollama_response() appends one
# dict per turn with the keys "response_time", "prompt", "thought" and
# "response", and replays the list on the next request.
HIST_CONTEXT = []
|
|
|
|
def ping_server(timeout=5.0):
    """Check whether the Ollama server answers on its root URL.

    Sends a GET request to ``http://OLLAMA_SERVER:OLLAMA_PORT`` and prints
    the outcome.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` would block indefinitely on a host that
            accepts the connection but never answers.

    Returns:
        True if the server replied with HTTP 200; False on any other status,
        on a connection failure, or on a timeout.
    """
    url = f"http://{OLLAMA_SERVER}:{OLLAMA_PORT}"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        # Covers connection errors and timeouts alike.
        print(f"Error connecting to Ollama server: {e}")
        return False

    if response.status_code == 200:
        print("Ollama server is reachable.")
        return True

    print(f"Failed to reach Ollama server: {response.status_code}")
    return False
|
|
|
|
def _build_chat_messages(prompt):
    """Assemble the ``messages`` list for an Ollama ``/api/chat`` request.

    The list holds the system prompts, then every completed turn recorded in
    HIST_CONTEXT, then the new user prompt.
    """
    # Each chat message needs a "role" and a "content" key; a bare
    # {"system": ...} dict is not a valid message.
    messages = [{"role": "system", "content": text} for text in SYSTEM_PROMPTS]

    for turn in HIST_CONTEXT:
        # Turns without an answer (e.g. a failed request) are not replayed.
        if not (turn["prompt"] and turn["response"]):
            continue
        messages.append({"role": "user", "content": turn["prompt"]})
        if turn["thought"]:
            # NOTE(review): the reasoning trace is replayed as its own
            # assistant message, giving two consecutive assistant turns.
            # Confirm this is intended rather than dropping the trace.
            messages.append({"role": "assistant", "content": turn["thought"]})
        messages.append({"role": "assistant", "content": turn["response"]})

    messages.append({"role": "user", "content": prompt})
    return messages


def stream_ollama_response(model, prompt, output_format="cli"):
    """Send *prompt* to the Ollama chat endpoint and stream the reply to stdout.

    "thinking" tokens are printed first, followed by a header and the
    "content" tokens. A turn that streams to the end is appended to
    HIST_CONTEXT so later calls can replay it as context.

    Args:
        model: Name of the model to query.
        prompt: The user's message for this turn.
        output_format: Accepted for interface compatibility; this function
            does not read it.

    Returns:
        The concatenated "content" text of the reply. Returns "" when the
        server answers with a non-200 status or the request fails before any
        content arrives. If the connection breaks mid-stream, the text
        received so far is returned and the turn is not added to history.
    """
    url = f"http://{OLLAMA_SERVER}:{OLLAMA_PORT}/api/chat"
    headers = {"Content-Type": "application/json"}
    payload = {
        "model": model,
        "messages": _build_chat_messages(prompt),
        "stream": True,
    }

    thought_parts = []
    answer_parts = []
    try:
        # Only the connect phase is bounded: the first token can take a long
        # time while the model loads, so the read timeout stays unlimited.
        # The context manager releases the connection even on early exit.
        with requests.post(
            url, headers=headers, json=payload, stream=True, timeout=(10, None)
        ) as response:
            if response.status_code != 200:
                print(f"Error: {response.status_code} - {response.text}")
                return ""

            print("Thinking (plain text, code blocks are copy-paste ready):")
            in_thinking = True

            for line in response.iter_lines():
                if not line:
                    continue
                decoded_line = line.decode("utf-8")
                # Tolerate SSE-style framing in addition to plain JSON lines.
                if decoded_line.startswith("data: "):
                    decoded_line = decoded_line[6:]
                if decoded_line == "[DONE]":
                    break

                try:
                    chunk = json.loads(decoded_line)
                except json.JSONDecodeError:
                    # Best effort: skip lines that are not valid JSON.
                    continue
                if not isinstance(chunk, dict):
                    continue

                # Surface an error object sent inside the stream instead of
                # silently ending the reply.
                if chunk.get("error"):
                    print(f"\nError from Ollama server: {chunk['error']}")
                    break

                message = chunk.get("message")
                if not isinstance(message, dict):
                    continue

                # Stream 'thinking' tokens live.
                thinking = message.get("thinking")
                if thinking:
                    sys.stdout.write(thinking)
                    sys.stdout.flush()
                    thought_parts.append(thinking)

                # When content starts, print a header once and stream it live.
                content = message.get("content")
                if content:
                    if in_thinking:
                        print("\n\nResponse (copy-paste code blocks as needed):")
                        in_thinking = False
                    sys.stdout.write(content)
                    sys.stdout.flush()
                    answer_parts.append(content)
    except requests.exceptions.RequestException as e:
        print(f"\nError communicating with Ollama server: {e}")
        return "".join(answer_parts)

    print()  # newline after completion

    response_text = "".join(answer_parts)

    # Record the turn so the next request can replay it as context.
    HIST_CONTEXT.append({
        "response_time": time.strftime("%Y-%m-%d %H:%M:%S"),
        "prompt": prompt,
        "thought": "".join(thought_parts),
        "response": response_text,
    })
    return response_text
|
|
|
|
def main():
    """Run an interactive chat session against the Ollama server.

    Parses the model name from the command line, checks that the server is
    reachable, then loops reading prompts from stdin. Each prompt and reply
    is written to a timestamped ``conversation_*.txt`` file in the current
    directory. The loop ends on 'q', 'quit', 'done', 'bye', EOF, or Ctrl-C.
    """
    parser = argparse.ArgumentParser(description="Stream response from local Ollama server.")
    parser.add_argument("model", type=str, help="The model to use (e.g., 'llama2', 'gpt4o').")
    args = parser.parse_args()

    # argparse enforces presence, but an explicit empty string ("") still
    # gets through, so this check is reachable.
    if not args.model:
        print("Model is required.")
        sys.exit(1)

    if not ping_server():
        sys.exit(1)

    conversation_start_time = time.strftime("%Y%m%d-%H%M%S")
    output_filename = f"conversation_{conversation_start_time}.txt"
    print(f"\nEnter your prompt below. Type 'q', 'quit', 'done', or 'bye' to end the conversation.\nAll responses will be saved to {output_filename}.\n")

    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(f"Conversation started at {conversation_start_time}\nModel: {args.model}\n\n")
        while True:
            try:
                prompt = input("You: ").strip()
            except (EOFError, KeyboardInterrupt):
                print("\nConversation ended.")
                break

            if prompt.lower() in {"q", "quit", "done", "bye"}:
                print("Conversation ended.")
                break
            if not prompt:
                continue

            f.write(f"You: {prompt}\n")
            try:
                response_text = stream_ollama_response(args.model, prompt, output_format="cli")
            except KeyboardInterrupt:
                # Ctrl-C while the reply is streaming ends the session the
                # same way as Ctrl-C at the prompt, instead of a traceback.
                f.write("AI: [interrupted]\n\n")
                print("\nConversation ended.")
                break

            f.write(f"AI: {response_text}\n\n")
            # Flush after every turn so the transcript survives an abnormal exit.
            f.flush()


if __name__ == "__main__":
    main()