Spaces:
Runtime error
Runtime error
import os

# Packages are installed with pip at start-up, one shell command each, in
# the same order as before. Exit codes are not inspected.
for install_command in (
    'pip install openpyxl',
    'pip install sentence-transformers',
):
    os.system(install_command)
| import pandas as pd | |
| import gradio as gr | |
| from sentence_transformers import SentenceTransformer | |
# Sentence-embedding model used by search() to encode the query text.
# Model names noted by the original author: all-MiniLM-L6-v2, all-mpnet-base-v2
model = SentenceTransformer('all-mpnet-base-v2')

# Load the startup table and replace every `tags` value with its str() form.
df = pd.read_parquet('df_encoded3.parquet')
df['tags'] = df['tags'].map(str)
def parse_raised(x):
    """Convert a funding string such as ``'$5M'`` into an amount in millions.

    The first character of ``x`` is skipped and the last character is read
    as the magnitude suffix; everything in between is parsed as a float.

    Args:
        x: Either the literal ``'Undisclosed'`` or a string made of one
            leading character, a decimal number and a one-letter suffix
            (``K``, ``M`` or ``B``).

    Returns:
        ``0`` for ``'Undisclosed'``; otherwise the amount expressed in
        millions as a float. ``None`` when the suffix is not recognised.

    Raises:
        ValueError: If the text between the first and last character is not
            a valid number.
    """
    if x == 'Undisclosed':
        return 0

    suffix = x[-1]
    amount = float(x[1:-1])
    if suffix == 'K':
        return amount / 1000
    if suffix == 'M':
        return amount
    if suffix == 'B':
        # Billions used to fall through the K/M checks and yield None.
        return amount * 1000
    # Unknown magnitude: keep the historical "no value" result, but make it
    # explicit instead of relying on the implicit end-of-function None.
    return None
# Normalise the table: `raised` strings are converted with parse_raised,
# `stage` labels are lowercased, and the index is rebuilt from scratch.
df = df.assign(
    raised=df['raised'].apply(parse_raised),
    stage=df['stage'].apply(lambda label: label.lower()),
).reset_index(drop=True)
| from sklearn.neighbors import NearestNeighbors | |
| import pandas as pd | |
| from sentence_transformers import SentenceTransformer | |
# Nearest-neighbour index over the vectors stored in `text_vector_`.
# kneighbors() raises ValueError when n_neighbors is larger than the number
# of fitted samples, so the candidate pool of 5000 is capped at the number
# of rows actually available.
nbrs = NearestNeighbors(
    n_neighbors=min(5000, len(df)),
    algorithm='ball_tree',
).fit(df['text_vector_'].values.tolist())
def search(df, query):
    """Return the rows of ``df`` that are nearest to ``query``.

    The query text is encoded with the module-level ``model`` and looked up
    in the module-level ``nbrs`` index; the positions it returns are used to
    select rows of ``df`` by position.

    Args:
        df: Table to take the matching rows from.
        query: Free-text description to encode.

    Returns:
        The selected rows, in the order reported by ``nbrs.kneighbors``,
        restricted to the columns shown in the UI.
    """
    display_columns = [
        'name', 'raised', 'target', 'size', 'stage',
        'country', 'source', 'description', 'tags',
    ]
    query_vector = model.encode(query).tolist()
    # kneighbors takes a batch of vectors; wrap the single query in a list
    # and read back the first (only) row of positions.
    _, neighbor_positions = nbrs.kneighbors([query_vector])
    nearest_rows = df.iloc[list(neighbor_positions)[0]]
    return nearest_rows[display_columns]
def filter_df(df, column_name, filter_type, filter_value, minimum_acceptable_size=0):
    """Filter ``df`` on one column, falling back to ``df`` if too little is left.

    Args:
        df: Table to filter.
        column_name: Column the condition is evaluated on.
        filter_type: One of ``'=='``, ``'>='``, ``'<='`` or ``'contains'``.
            ``'contains'`` uses ``Series.str.contains`` with its default
            pattern handling, so ``filter_value`` is treated as a regular
            expression.
        filter_value: Value (or pattern) to compare the column against.
        minimum_acceptable_size: Smallest ``DataFrame.size`` (rows times
            columns) the filtered result may have and still be returned.

    Returns:
        The filtered table, or the unfiltered ``df`` when the filtered
        result's ``size`` is below ``minimum_acceptable_size``.

    Raises:
        ValueError: If ``filter_type`` is not one of the supported operators.
    """
    column = df[column_name]
    if filter_type == '==':
        mask = column == filter_value
    elif filter_type == '>=':
        mask = column >= filter_value
    elif filter_type == '<=':
        mask = column <= filter_value
    elif filter_type == 'contains':
        # Use the requested column (this branch used to be hard-wired to
        # 'target') and treat missing values as "no match" so the mask stays
        # purely boolean.
        mask = column.str.contains(filter_value, na=False)
    else:
        # Previously surfaced as an UnboundLocalError further down.
        raise ValueError(f'Unsupported filter_type: {filter_type!r}')

    df_filtered = df[mask]
    if df_filtered.size >= minimum_acceptable_size:
        return df_filtered
    return df
| #the first module becomes text1, the second module file1 | |
def greet(size, target, stage, query):
    """Search for startups matching ``query`` and narrow them by the UI filters.

    The nearest rows returned by ``search`` are filtered by ``size``, then by
    ``stage`` (skipped when ``stage`` is ``'ALL'``), then by ``target``. Each
    filter is called with a minimum acceptable size of 1, so a filter that
    would leave nothing is ignored by ``filter_df``.

    Args:
        size: Company-size label to match exactly.
        target: Text the ``target`` column must contain.
        stage: Funding stage to match (compared in lowercase), or ``'ALL'``.
        query: Free-text description passed to ``search``.

    Returns:
        At most the first 100 rows that remain after filtering.
    """
    # Work on an explicit copy: search() returns a selection of the global
    # table, and assigning a column into such a selection triggers pandas'
    # SettingWithCopyWarning.
    df_knn = search(df, query).copy()

    # A raised amount of 0 is shown to the user as the text 'Undisclosed'.
    df_knn['raised'] = df_knn['raised'].apply(
        lambda amount: 'Undisclosed' if amount == 0 else amount
    )

    df_size = filter_df(df_knn, 'size', '==', size, 1)
    if stage != 'ALL':
        df_stage = filter_df(df_size, 'stage', '==', stage.lower(), 1)
    else:
        # 'ALL' bypasses the stage filter.
        df_stage = df_size
    print(df_stage.size)

    df_target = filter_df(df_stage, 'target', 'contains', target, 1)
    # Results keep the nearest-neighbour order; no sorting is applied.
    return df_target[0:100]
# Gradio UI: three radio filters and a free-text query feed greet(), whose
# result is rendered in a dataframe component.
with gr.Blocks(theme=gr.themes.Soft(primary_hue='amber', secondary_hue='gray', neutral_hue='amber')) as demo:
    gr.Markdown('\n# Startup Search Engine\n')
    # NOTE: `multiselect` is not a gr.Radio parameter (a radio group is
    # single-choice by construction). Gradio 3.x only warned about the
    # unused keyword; newer releases are expected to reject it with a
    # TypeError, so it is no longer passed.
    size = gr.Radio(['1-10', '11-50', '51-200', '201-500', '500+', '11-500+'], value='11-500+', label='size')
    target = gr.Radio(['B2B', 'B2C', 'B2G', 'B2B2C'], value='B2B', label='target')
    stage = gr.Radio(['pre-seed', 'A', 'B', 'C', 'ALL'], value='ALL', label='stage')
    # A "minimum raised (in millions)" slider was prototyped here but is
    # currently disabled.
    query = gr.Textbox(label='Describe the Startup you are searching for', value='age reversing')
    btn = gr.Button(value="Search for a Startup")
    output1 = gr.DataFrame(label='value')
    # Input order must match greet(size, target, stage, query).
    btn.click(greet, [size, target, stage, query], [output1])

demo.launch(share=False)