-
Notifications
You must be signed in to change notification settings - Fork 36
Improving the Yelp Bean matching algorithm #300
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 8 commits
40ed781
114cea9
5dc9cd6
02b6913
28aa84e
6d1c8c1
6d4c9c4
f91ddfd
8638b40
4666685
1a9591b
b62ab33
5bc0498
36278ba
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -45,6 +45,7 @@ packaging==23.2 | |
# pyproject-api | ||
# pytest | ||
# tox | ||
pandas==1.5.3 | ||
platformdirs==3.11.0 | ||
# via | ||
# tox | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,7 @@ flask-api-utils | |
Flask-SQLAlchemy | ||
httplib2 | ||
networkx | ||
pandas | ||
psycopg2-binary | ||
pydantic | ||
pytz | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,9 +25,29 @@ def test_generate_meetings_same_department(session, subscription): | |
preference = subscription.datetime[0] | ||
user_pref = UserSubscriptionPreferences(preference=preference, subscription=subscription) | ||
session.add(user_pref) | ||
user1 = User(email="[email protected]", meta_data={"department": "dept"}, subscription_preferences=[user_pref]) | ||
user1 = User( | ||
id=1, | ||
email="[email protected]", | ||
meta_data={"department": "dept"}, | ||
subscription_preferences=[user_pref], | ||
manager_id="0", | ||
languages="en, fr", | ||
days_since_start=100, | ||
employee_id="101", | ||
location="UK, London", | ||
) | ||
session.add(user1) | ||
user2 = User(email="[email protected]", meta_data={"department": "dept"}, subscription_preferences=[user_pref]) | ||
user2 = User( | ||
id=2, | ||
email="[email protected]", | ||
meta_data={"department": "dept"}, | ||
subscription_preferences=[user_pref], | ||
manager_id="101", | ||
languages="en, fr", | ||
days_since_start=100, | ||
employee_id="102", | ||
location="CA, London", | ||
) | ||
session.add(user2) | ||
user_list = [user1, user2] | ||
session.commit() | ||
|
@@ -47,13 +67,53 @@ def test_generate_meetings_with_history(session, subscription): | |
user_pref = UserSubscriptionPreferences(preference=preference, subscription=subscription) | ||
session.add(user_pref) | ||
|
||
user1 = User(email="[email protected]", meta_data={"department": "dept"}, subscription_preferences=[user_pref]) | ||
user1 = User( | ||
id=1, | ||
email="[email protected]", | ||
meta_data={"department": "dept"}, | ||
subscription_preferences=[user_pref], | ||
manager_id="0", | ||
languages="en, fr", | ||
days_since_start=100, | ||
employee_id="101", | ||
location="UK, London", | ||
) | ||
session.add(user1) | ||
user2 = User(email="[email protected]", meta_data={"department": "dept2"}, subscription_preferences=[user_pref]) | ||
user2 = User( | ||
id=2, | ||
email="[email protected]", | ||
meta_data={"department": "dept2"}, | ||
subscription_preferences=[user_pref], | ||
manager_id="101", | ||
languages="en, fr", | ||
days_since_start=100, | ||
employee_id="102", | ||
location="CA, London", | ||
) | ||
session.add(user2) | ||
user3 = User(email="[email protected]", meta_data={"department": "dept"}, subscription_preferences=[user_pref]) | ||
user3 = User( | ||
id=3, | ||
email="[email protected]", | ||
meta_data={"department": "dept"}, | ||
subscription_preferences=[user_pref], | ||
manager_id="101", | ||
languages="", | ||
days_since_start=100, | ||
employee_id="103", | ||
location="UK, London", | ||
) | ||
session.add(user3) | ||
user4 = User(email="[email protected]", meta_data={"department": "dept2"}, subscription_preferences=[user_pref]) | ||
user4 = User( | ||
id=4, | ||
email="[email protected]", | ||
meta_data={"department": "dept2"}, | ||
subscription_preferences=[user_pref], | ||
manager_id="101", | ||
languages="en", | ||
days_since_start=100, | ||
employee_id="104", | ||
location="US, SF", | ||
) | ||
session.add(user4) | ||
|
||
user_list = [user1, user2, user3, user4] | ||
|
@@ -102,7 +162,17 @@ def test_no_re_matches(session): | |
users = [] | ||
num_users = 20 | ||
for i in range(0, num_users): | ||
user = User(email=f"{i}@yelp.com", meta_data={"department": f"dept{i}"}, subscription_preferences=[user_pref]) | ||
user = User( | ||
id=i, | ||
email=f"{i}@yelp.com", | ||
meta_data={"department": f"dept{i//2}"}, | ||
subscription_preferences=[user_pref], | ||
manager_id="101", | ||
languages="en", | ||
days_since_start=100, | ||
employee_id=f"{100+i}", | ||
location="", | ||
) | ||
session.add(user) | ||
mr = MeetingRequest(user=user, meeting_spec=meeting_spec) | ||
session.add(mr) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,8 +28,26 @@ def test_generate_save_meetings(session, subscription): | |
pref_1 = SubscriptionDateTime(datetime=datetime.now() - timedelta(weeks=MEETING_COOLDOWN_WEEKS - 1)) | ||
subscription = MeetingSubscription(title="all engineering weekly", datetime=[pref_1]) | ||
user_pref = UserSubscriptionPreferences(preference=pref_1, subscription=subscription) | ||
user1 = User(email="[email protected]", meta_data={"department": "dept"}, subscription_preferences=[user_pref]) | ||
user2 = User(email="[email protected]", meta_data={"department": "dept2"}, subscription_preferences=[user_pref]) | ||
user1 = User( | ||
email="[email protected]", | ||
meta_data={"department": "dept"}, | ||
subscription_preferences=[user_pref], | ||
manager_id="0", | ||
languages="en, fr", | ||
days_since_start=100, | ||
employee_id="101", | ||
location="UK, London", | ||
) | ||
user2 = User( | ||
email="[email protected]", | ||
meta_data={"department": "dept2"}, | ||
subscription_preferences=[user_pref], | ||
manager_id="101", | ||
languages="en, fr", | ||
days_since_start=100, | ||
employee_id="102", | ||
location="CA, London", | ||
) | ||
meeting_spec = MeetingSpec(meeting_subscription=subscription, datetime=pref_1.datetime) | ||
mr1 = MeetingRequest(user=user1, meeting_spec=meeting_spec) | ||
mr2 = MeetingRequest(user=user2, meeting_spec=meeting_spec) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
from yelp_beans.models import Employee | ||
|
||
|
||
def get_employee(work_email): | ||
return Employee.query.filter(Employee.work_email == work_email).first() |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,8 @@ | |
from datetime import datetime | ||
from datetime import timedelta | ||
|
||
import networkx as nx | ||
import pandas as pd | ||
from database import db | ||
|
||
from yelp_beans.logic.config import get_config | ||
|
@@ -86,3 +88,96 @@ def get_previous_meetings(subscription, cooldown=None): | |
disallowed_meetings = {tuple([meeting.id for meeting in meeting]) for meeting in disallowed_meetings} | ||
|
||
return disallowed_meetings | ||
|
||
|
||
def jaccard(list1, list2): | ||
intersection = len(list(set(list1).intersection(list2))) | ||
if intersection == 0: | ||
return 1 | ||
else: | ||
union = (len(list1) + len(list2)) - intersection | ||
return float(intersection) / union | ||
|
||
|
||
def get_pairwise_distance( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Would it be possible to make the attributes used configurable? I think it'd be great if the choice of attributes to apply could be configured differently for different subscriptions. |
||
user_pair, | ||
org_graph, | ||
employee_df, | ||
max_tenure=1000, | ||
): | ||
""" | ||
get the distance between two users. | ||
The returned distance score is a linear combination of the (normalized) distances of multiple user attributes. | ||
The importance of each attribute is considered equal. | ||
User attribute considered: | ||
1. team/function: distance in the org chart | ||
2. location - country, city | ||
3. tenure at Yelp | ||
4. language | ||
|
||
note: we considered using education and work experience, but think they likely correlate with the first attribute | ||
""" | ||
user_a, user_b = user_pair | ||
user_a_attributes = dict(employee_df.loc[user_a]) | ||
user_b_attributes = dict(employee_df.loc[user_b]) | ||
|
||
distance = 0 | ||
dist_1 = nx.shortest_path_length(org_graph, user_a, user_b) | ||
dist_1 = dist_1 / 10 # approx. min-max scaled | ||
distance += dist_1 | ||
|
||
# location | ||
try: | ||
user_a_city, user_a_country = user_a_attributes["location"].split(", ") | ||
except ValueError: | ||
user_a_city, user_a_country = "unkown", user_a_attributes["location"] | ||
conancain marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
try: | ||
user_b_city, user_b_country = user_b_attributes["location"].split(", ") | ||
except ValueError: | ||
user_b_city, user_b_country = "unkown", user_b_attributes["location"] | ||
conancain marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
country_dist = 0 if user_a_country == user_b_country else 1 | ||
city_dist = 0 if user_a_city == user_b_city else 1 | ||
dist_2 = country_dist + city_dist | ||
dist_2 = dist_2 / 2 # min-max scaled | ||
distance += dist_2 | ||
|
||
# tenure | ||
dist_3 = abs(int(user_a_attributes["days_since_start"]) - int(user_b_attributes["days_since_start"])) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tenure is a bit subjective. I don't have strong opinions here as long as it doesn't lead to starvation. Fundamental to this approach is the assumption that tenured folks already know each other, so we optimize for meeting newer, less tenured people. I think this works for v1, but I'll be curious to hear feedback on whether folks not getting matched with similarly tenured people gets noticed. Perhaps eventually we should get to a place where we can ask users to tell us their preferences for matching. |
||
dist_3 = dist_3 / max_tenure | ||
distance += dist_3 | ||
|
||
# language | ||
lang_similarity = jaccard(user_a_attributes["languages"], user_b_attributes["languages"]) | ||
dist_4 = 1 - lang_similarity | ||
distance += dist_4 | ||
|
||
return distance | ||
|
||
|
||
def get_meeting_weights(allowed_meetings): | ||
""" | ||
generate a distance score for each user pair. | ||
""" | ||
meeting_to_weight = {} | ||
|
||
# need to convert this to JSON to match the previous logic | ||
db_query_result = db.session.query(User).all() | ||
json_dump = [obj.serialize() for obj in db_query_result] | ||
employees = pd.DataFrame(json_dump) | ||
|
||
employees["languages"] = employees["languages"].apply(lambda x: x.split(", ")) | ||
employees = employees[["id", "manager_id", "days_since_start", "location", "languages", "email", "employee_id"]] | ||
employees = employees.merge( | ||
employees[["employee_id", "id"]], how="left", left_on="manager_id", right_on="employee_id", suffixes=("", "_manager") | ||
) | ||
employees = employees.set_index("id", drop=False) | ||
max_tenure = max(employees["days_since_start"].astype(int)) | ||
|
||
# yelp employee network graph created through reporting line | ||
G = nx.Graph() | ||
G.add_edges_from(list(zip(employees["id"], employees["id_manager"]))) | ||
for user_pair in allowed_meetings: | ||
users_distance_score = get_pairwise_distance(user_pair, org_graph=G, employee_df=employees.copy(), max_tenure=max_tenure) | ||
meeting_to_weight[user_pair] = users_distance_score | ||
|
||
return meeting_to_weight |
Uh oh!
There was an error while loading. Please reload this page.