From 54a41046ce4b7b1a14f14150f003ac38b73d4502 Mon Sep 17 00:00:00 2001 From: Amir Sabani Date: Mon, 8 Jan 2024 00:28:20 +0100 Subject: [PATCH] Fixed response/JSON --- .../scrapingsingle.cpython-310.pyc | Bin 4361 -> 4498 bytes pyth/__pycache__/vectData.cpython-310.pyc | Bin 6449 -> 6449 bytes pyth/articles.py | 5 ++- pyth/scrapingsingle.py | 41 +++++++++--------- 4 files changed, 23 insertions(+), 23 deletions(-) diff --git a/pyth/__pycache__/scrapingsingle.cpython-310.pyc b/pyth/__pycache__/scrapingsingle.cpython-310.pyc index 2598ed55e69e99b6c90bdb13b4eeebcdb0e7a35c..b39ce5c530a438802d33a09755610775c64abdd0 100644 GIT binary patch delta 2015 zcmZ8iO;8h89N)LuY&Kt;009!@BM>18C_=yhiXe&~lv~C4Ho|tqh}cmhYU_r+ z=NmI(=o_%&c8k%nr)xD@QKwmLcEU&y30k%#MBT%{ZaWP=Aw~zN+mg`bq-mO@DWv@{fT=Pb z?x$UJAL<4@X$Vq_(#N`yg+a)|(6(&kAjj@QW;YMHCxl2>Kc(HYXGhwC%%ZeSjA0mn z=(gZSU>HV{l5qeIc-mpl+7C^7koIoK_Xz_S#sc3Oqag1df>G#tA{%2sV5~Y$`>X_z*qp@@!YC}I;u z=!I#d(L zc!y5H(nIO7yp7-Gl5`oz(O9~&boF_Y-XJ$n^MQBrMgBt0^buD*zX`R4r95%tP41Mv z0+gDyij!CPsIQOPzEm%o!&bdaOSImA+I1l^*esi@RCVgLZ@%jQOY+D5Zt{|U=f9APqSE{#AW6^sERH;D5zr=}6~R?0 zU@myZ3u=b6^N-YAxI-8{=;U|Q?w)RxUkYc1C?teWgs6BzD6xzvb%CYOn)kiOM72LD z0_zeWBAc0-=}>gr9m`yWQp0RmP%_s_H2?}UD?IlL7((Flk#_E?Ka&prRiMB9O>l6w z0yVg~$>t{ci@^KMR!2lwC!g1nEd^v96+t0M(cTze*J>oof7g}^Z@m^exWP5^X2~*Z z8_tvn68EbT4WEd3WDK6+7lS{MIX)PAmrQaNI+2?dfqpNDi6}QD_|o;Vbq(HrPyR>f zI623A!o`)>Rr4~-ix9_!PzGlmXsp)l%{c=_M@C3U?7T2(rIKAXYo!u9ield9 z%IgI@<9H@m4pn$rDgI-4dcsTZ_b>`9rgo=`;(N81hmSNJ`A$Gm-R>izu)U1zbYd z>yz~e#p~`_rw~PRmSCR-(q_X+}tJ+Gv3IiJ>4i)OJFWnv%zu76>!*Pk-x-$i%zwQY059Vz7=PZiH_Mk z@6|I~u3UHAgjKFpR@d=nM&b2NF~67VI;=Jp%Z)MI~Eq6Y6ZkPvWO?FS|;}GhJb&sx8@gw&gV3k>oaQ&CiahI1c>ovV*+Lb z6!_nIzj{uvr+Hs&IV;A79T#v$fOpGh1$Pp`RlTq0uu=XXHlX+#jSc=hc7&ApU$J3w znJ42jTW>yz$BFOaYP0ymaLs}_f|BzHAtE&gSzyu|YQd?pN7{hGiwNN zXd(qw>H!3-Pz97KL29K|s)X8mrJj4?(#!q_m3r$P_02j^nrMGB@AuyC&70l#${Y7fVNFo3c`q; zCBipDFDdKdVIDCl>*iBD20h$>-gTA7dE#D>?dQ$Bazz3;9G?c@nM*MPWT995i>7>k}WCx;C+Q1 zgJb1IPEw@&Vfi>)f+e=J39UOPUOB*J_)(!>u=6?J76d@4OB1MR-ue zsup;jSL;x@_3N`hji#D$ZoD8ygXQT^DWa@W`jLo~n+{%%ccRx(5yI4?52+U zQ}EnK0!j0A6Cs(PXvMuqN@$nRhTv%&Fc)Op0WD2C+*8^}Bqg0bbh>x6?w($>ofZU@ zsx+*GX;jhh2~$-OX}7ECFPd_LdVgC(B9n#|N>G8@HL*D99@Wp!_?{7&Ru<~D3aD9y zBIGyB`n9}M!>u!p4o5>!X$Kx7#@yfa3LSJOL#xy86(r@Mb@O)KGAq}e87VRlG^#5) zjmm!otD19v4!x#F-7mvu>6H6N_{2z73WL5#NR=0s`~0nfbrTwQZB*~hMve#05aRyU z)b2ir^yc54ol(S)6sl$&?Dr*-3@tcNFIMf^;uSO<1tCeqS?SXAdAnd%@_BIt%}l^k zu1(`Jh0kGez>P#NWc|Fsx>>jWSBq4K7cA3hWVX=$M`rFI;*#{ynrlQ;bK(;;#CZu9 z5ccX6J(74=z5n8USyl3%6Zh}vQ~JdHDz>H;oW8IK%E86hhZ3+@vmy(xlHp;R3&X%#A^;x)Yg zIJ_GgAwRRH--2R&8HYw-a}uFT2uYAX5h5ZXDj|m8sb!~H!Cv(TLdY-fH?ybVMB#{t zxD3*YmWic%?rc1{x+s}|Qx{&qtkgx9l%yq$Ntlx`f#5~z)q25dw3BBPtCodj0nb-p z!x7jBM4WV=##e`B(}-gdmL>RacSY`wBY2ws@f int: encoding = tiktoken.encoding_for_model(model) return len(encoding.encode(string)) def slice_text_at_2k_tokens(text): encoding_name = "gpt-3.5-turbo" - max_tokens = 2000 - + max_tokens = 1950 encoding = tiktoken.encoding_for_model(encoding_name) tokens = encoding.encode(text) - if len(tokens) <= max_tokens: return [text] - sliced_tokens = tokens[:max_tokens] sliced_text = encoding.decode(sliced_tokens) - return sliced_text +def slice_title_if_needed(text): + encoding_name = "gpt-3.5-turbo" + max_tokens = 100 + encoding = tiktoken.encoding_for_model(encoding_name) + tokens = encoding.encode(text) + if len(tokens) <= max_tokens: + return [text] + sliced_tokens = tokens[:max_tokens] + sliced_text = encoding.decode(sliced_tokens) + return sliced_text def replace_with_spaces(text): allowed_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzČčĆćDždžĐ𩹮ž0123456789 " cleaned_text = ''.join(char if char in allowed_chars else ' ' for char in text) return cleaned_text - def fix_links(links_set): modified_links = set() - for link in links_set: if "www" in link: modified_link = link.replace("www.", "") modified_links.add(modified_link) else: modified_links.add(link) - return modified_links total_links = set() collected_news = set() - def get_article_links(url, already_checked): response = requests.get(url,headers) if response.status_code == 200: @@ -81,25 +81,22 @@ def get_article_links(url, already_checked): already_checked.add(link_value) return link_store - already_checked = set() for dlink in dlinks: temp_links = get_article_links(dlink, already_checked) if temp_links: total_links.update(temp_links) - final_links = {item for item in total_links if item} db_links = set(get_all_links()) new_links = final_links - db_links final_links = new_links final_links = set(final_links) - final_links = fix_links(final_links) if __name__ == '__main__': - + for link in final_links: response = requests.get(link,headers) soup = BeautifulSoup(response.text, 'html.parser') @@ -117,24 +114,26 @@ if __name__ == '__main__': text_text = slice_text_at_2k_tokens(text_text) text_text = replace_with_spaces(str(text_text)) - + + ttk = num_tokens_from_string(text_text) + + if ttk > 1900: + title_text = slice_title_if_needed(title_text) try: completion = client.chat.completions.create( model="gpt-3.5-turbo", messages=[ {"role": "system", "content": "Data analytic, Journalist and News reporter"}, - {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data as a JSON object with 'title' and 'content' fields."} + {"role": "user", "content": rf"Extract relevant information from the following input: Title: {title_text}, Text: {text_text}. Remove any non-news element related to the current text and title, and provide the cleaned data make sure that its valid JSON object with 'title' field and 'content' field."} ] ) generated_text = completion.choices[0].message.content - generated_text = generated_text + generated_text = repair_json(generated_text) response_data = json.loads(generated_text) - title = response_data["title"] text = response_data["content"] - vector = embeddings.embed_query(generated_text) if not is_similar_data(title, text, link, vector, threshold=0.98):