diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index a298af40b20b0..8c2bdfe1cb186 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -1941,6 +1941,9 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         llama_reset_timings(llama_ctx_v4);
     }
 
+    generation_finished = false; // Set current generation status
+    generated_tokens.clear(); // New Generation, new tokens
+
     concat_output_mtx.lock();
     concat_output = "";
     concat_output_reader_copy_poll = "";
@@ -2140,8 +2143,6 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
 
     bool allow_regular_prints = (debugmode!=-1 && !inputs.quiet) || debugmode >= 1;
 
-    generation_finished = false; // Set current generation status
-    generated_tokens.clear(); // New Generation, new tokens
     std::string grammarstr = inputs.grammar;
     bool grammar_retain_state = inputs.grammar_retain_state;
 
diff --git a/koboldcpp.py b/koboldcpp.py
index 8b8a901f11a6f..d1e0b0c98efdc 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -41,7 +41,7 @@
 modelbusy = threading.Lock()
 requestsinqueue = 0
 defaultport = 5001
-KcppVersion = "1.73"
+KcppVersion = "1.73.1"
 showdebug = True
 guimode = False
 showsamplerwarning = True
@@ -1412,11 +1412,7 @@
         def run_blocking(): # api format 1=basic,2=kai,3=oai,4=oai-chat
             global last_non_horde_req_time
             last_non_horde_req_time = time.time()
-            return generate(
-                genparams=genparams,
-                is_quiet=is_quiet,
-                stream_flag=stream_flag
-            )
+            return generate(genparams=genparams,is_quiet=is_quiet,stream_flag=stream_flag)
 
         genout = {"text": "", "status": -1, "stopreason": -1}
         if stream_flag:
@@ -1486,7 +1482,7 @@ async def handle_sse_stream(self, genparams, api_format):
         current_token = 0
         incomplete_token_buffer = bytearray()
         async_sleep_short = 0.02
-        await asyncio.sleep(0.3) #anti race condition, prevent check from overtaking generate
+        await asyncio.sleep(0.5) #anti race condition, prevent check from overtaking generate
         try:
             tokenReserve = "" #keeps fully formed tokens that we cannot send out yet
             while True:
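
For context on the race this patch narrows, here is a minimal sketch (plain Python with invented names, not KoboldCpp's actual server code) of the ordering hazard: if the status flag left over from the previous generation is only cleared partway through the new request's setup, a concurrent status check can briefly observe the stale "finished" state and stop streaming before any new tokens arrive.

import threading
import time

generation_finished = True   # stale value left over from a previous run
generated_tokens = ["old"]   # stale tokens from a previous run

def generate_with_late_reset():
    global generation_finished, generated_tokens
    time.sleep(0.2)              # setup work done BEFORE the flags are reset (old ordering)
    generation_finished = False  # reset arrives too late for an early poller
    generated_tokens = []
    time.sleep(0.5)              # ...produce tokens...
    generation_finished = True

def poller():
    # A streaming check that fires immediately can see the stale
    # generation_finished == True and give up before tokens arrive.
    time.sleep(0.05)
    print("poller sees finished =", generation_finished,
          "tokens =", generated_tokens)

t = threading.Thread(target=generate_with_late_reset)
p = threading.Thread(target=poller)
t.start(); p.start()
t.join(); p.join()

Resetting generation_finished and generated_tokens at the very start of gpttype_generate (first hunk), and giving generate a slightly longer head start before the SSE loop's first check (0.3 s to 0.5 s), both shrink that window.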