Skip to content
Open
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions api/requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ packaging==23.2
# pyproject-api
# pytest
# tox
pandas==1.5.3
platformdirs==3.11.0
# via
# tox
Expand Down
1 change: 1 addition & 0 deletions api/requirements-minimal.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ flask-api-utils
Flask-SQLAlchemy
httplib2
networkx
pandas
psycopg2-binary
pydantic
pytz
Expand Down
1 change: 1 addition & 0 deletions api/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ markupsafe==2.1.3
# werkzeug
networkx==3.1
# via -r requirements-minimal.txt
pandas==1.5.3
psycopg2-binary==2.9.9
# via -r requirements-minimal.txt
pydantic==2.4.2
Expand Down
84 changes: 77 additions & 7 deletions api/tests/matching/match_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,29 @@ def test_generate_meetings_same_department(session, subscription):
preference = subscription.datetime[0]
user_pref = UserSubscriptionPreferences(preference=preference, subscription=subscription)
session.add(user_pref)
user1 = User(email="[email protected]", meta_data={"department": "dept"}, subscription_preferences=[user_pref])
user1 = User(
id=1,
email="[email protected]",
meta_data={"department": "dept"},
subscription_preferences=[user_pref],
manager_id="0",
languages="en, fr",
days_since_start=100,
employee_id="101",
location="UK, London",
)
session.add(user1)
user2 = User(email="[email protected]", meta_data={"department": "dept"}, subscription_preferences=[user_pref])
user2 = User(
id=2,
email="[email protected]",
meta_data={"department": "dept"},
subscription_preferences=[user_pref],
manager_id="101",
languages="en, fr",
days_since_start=100,
employee_id="102",
location="CA, London",
)
session.add(user2)
user_list = [user1, user2]
session.commit()
Expand All @@ -47,13 +67,53 @@ def test_generate_meetings_with_history(session, subscription):
user_pref = UserSubscriptionPreferences(preference=preference, subscription=subscription)
session.add(user_pref)

user1 = User(email="[email protected]", meta_data={"department": "dept"}, subscription_preferences=[user_pref])
user1 = User(
id=1,
email="[email protected]",
meta_data={"department": "dept"},
subscription_preferences=[user_pref],
manager_id="0",
languages="en, fr",
days_since_start=100,
employee_id="101",
location="UK, London",
)
session.add(user1)
user2 = User(email="[email protected]", meta_data={"department": "dept2"}, subscription_preferences=[user_pref])
user2 = User(
id=2,
email="[email protected]",
meta_data={"department": "dept2"},
subscription_preferences=[user_pref],
manager_id="101",
languages="en, fr",
days_since_start=100,
employee_id="102",
location="CA, London",
)
session.add(user2)
user3 = User(email="[email protected]", meta_data={"department": "dept"}, subscription_preferences=[user_pref])
user3 = User(
id=3,
email="[email protected]",
meta_data={"department": "dept"},
subscription_preferences=[user_pref],
manager_id="101",
languages="",
days_since_start=100,
employee_id="103",
location="UK, London",
)
session.add(user3)
user4 = User(email="[email protected]", meta_data={"department": "dept2"}, subscription_preferences=[user_pref])
user4 = User(
id=4,
email="[email protected]",
meta_data={"department": "dept2"},
subscription_preferences=[user_pref],
manager_id="101",
languages="en",
days_since_start=100,
employee_id="104",
location="US, SF",
)
session.add(user4)

user_list = [user1, user2, user3, user4]
Expand Down Expand Up @@ -102,7 +162,17 @@ def test_no_re_matches(session):
users = []
num_users = 20
for i in range(0, num_users):
user = User(email=f"{i}@yelp.com", meta_data={"department": f"dept{i}"}, subscription_preferences=[user_pref])
user = User(
id=i,
email=f"{i}@yelp.com",
meta_data={"department": f"dept{i//2}"},
subscription_preferences=[user_pref],
manager_id="101",
languages="en",
days_since_start=100,
employee_id=f"{100+i}",
location="",
)
session.add(user)
mr = MeetingRequest(user=user, meeting_spec=meeting_spec)
session.add(mr)
Expand Down
22 changes: 20 additions & 2 deletions api/tests/matching/match_utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,26 @@ def test_generate_save_meetings(session, subscription):
pref_1 = SubscriptionDateTime(datetime=datetime.now() - timedelta(weeks=MEETING_COOLDOWN_WEEKS - 1))
subscription = MeetingSubscription(title="all engineering weekly", datetime=[pref_1])
user_pref = UserSubscriptionPreferences(preference=pref_1, subscription=subscription)
user1 = User(email="[email protected]", meta_data={"department": "dept"}, subscription_preferences=[user_pref])
user2 = User(email="[email protected]", meta_data={"department": "dept2"}, subscription_preferences=[user_pref])
user1 = User(
email="[email protected]",
meta_data={"department": "dept"},
subscription_preferences=[user_pref],
manager_id="0",
languages="en, fr",
days_since_start=100,
employee_id="101",
location="UK, London",
)
user2 = User(
email="[email protected]",
meta_data={"department": "dept2"},
subscription_preferences=[user_pref],
manager_id="101",
languages="en, fr",
days_since_start=100,
employee_id="102",
location="CA, London",
)
meeting_spec = MeetingSpec(meeting_subscription=subscription, datetime=pref_1.datetime)
mr1 = MeetingRequest(user=user1, meeting_spec=meeting_spec)
mr2 = MeetingRequest(user=user2, meeting_spec=meeting_spec)
Expand Down
5 changes: 5 additions & 0 deletions api/yelp_beans/logic/employee.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from yelp_beans.models import Employee


def get_employee(work_email):
    """
    look up an employee by work email.

    Filters Employee rows on ``work_email`` equality and returns the result of
    ``.first()`` on that query (the first match; per SQLAlchemy's ``Query.first``
    this is ``None`` when nothing matches).
    """
    matching_employees = Employee.query.filter(Employee.work_email == work_email)
    return matching_employees.first()
95 changes: 95 additions & 0 deletions api/yelp_beans/matching/match_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from datetime import datetime
from datetime import timedelta

import networkx as nx
import pandas as pd
from database import db

from yelp_beans.logic.config import get_config
Expand Down Expand Up @@ -86,3 +88,96 @@ def get_previous_meetings(subscription, cooldown=None):
disallowed_meetings = {tuple([meeting.id for meeting in meeting]) for meeting in disallowed_meetings}

return disallowed_meetings


def jaccard(list1, list2):
    """
    Jaccard-style similarity between two collections of hashable items.

    Both inputs are treated as sets, so duplicate entries do not affect the
    score. The result is ``len(A & B) / len(A | B)`` when the inputs share at
    least one item.

    NOTE(review): when the inputs share nothing (including when either one is
    empty) this returns 1, not the textbook value 0. The visible caller uses
    ``1 - jaccard(...)`` as a distance, so pairs with no common item get
    distance 0 - confirm this is the intended behaviour before changing it.
    """
    items_a = set(list1)
    items_b = set(list2)
    intersection = len(items_a & items_b)
    if intersection == 0:
        return 1
    # Use the size of the set union rather than the raw list lengths so that
    # repeated items in either input cannot deflate the similarity.
    union = len(items_a | items_b)
    return float(intersection) / union


def _split_location(location):
    # Split a "<part>, <part>" location string into its two components.
    # A missing (non-string) location is treated like an empty string so that a
    # NULL column value does not raise AttributeError on .split().
    if not isinstance(location, str):
        location = ""
    try:
        first_part, second_part = location.split(", ")
    except ValueError:
        # Not exactly two parts: keep the whole string as the second component
        # and use the placeholder (spelling kept as-is) for the first one.
        first_part, second_part = "unkown", location
    return first_part, second_part


def get_pairwise_distance(
    user_pair,
    org_graph,
    employee_df,
    max_tenure=1000,
):
    """
    get the distance between two users.
    The returned distance score is a linear combination of the multiple user attributes' distance (normalized).
    The importance of each attribute is considered equal.
    User attribute considered:
    1. team/function: distance in the org chart
    2. location - country, city
    3. tenure at Yelp
    4. language

    note: we considered using education and work experience, but think it likely correlates with the first attribute

    user_pair: the two labels to compare; each is looked up with ``employee_df.loc``
        and used as a node of ``org_graph``.
    org_graph: graph passed to ``nx.shortest_path_length``; any exception that call
        raises (e.g. when the two users are not connected) is propagated.
    employee_df: DataFrame whose rows provide "location", "days_since_start" and
        "languages" for each user.
    max_tenure: divisor used to scale the tenure difference; a falsy value
        (0 / None) makes the tenure term contribute 0 instead of dividing by zero.
    """
    # NOTE(review): reviewer feedback asked whether the attributes used here
    # could be made configurable, ideally so that different subscriptions can
    # choose a different set of attributes.
    user_a, user_b = user_pair
    user_a_attributes = dict(employee_df.loc[user_a])
    user_b_attributes = dict(employee_df.loc[user_b])

    distance = 0

    # team/function: hops between the two users in the org graph
    dist_1 = nx.shortest_path_length(org_graph, user_a, user_b)
    dist_1 = dist_1 / 10  # approx. min-max scaled
    distance += dist_1

    # location: both components are compared independently, so it does not
    # matter which of them is the city and which is the country.
    user_a_first, user_a_second = _split_location(user_a_attributes["location"])
    user_b_first, user_b_second = _split_location(user_b_attributes["location"])
    second_part_dist = 0 if user_a_second == user_b_second else 1
    first_part_dist = 0 if user_a_first == user_b_first else 1
    dist_2 = second_part_dist + first_part_dist
    dist_2 = dist_2 / 2  # min-max scaled
    distance += dist_2

    # tenure
    # NOTE(review): reviewer feedback - tenure is a somewhat subjective signal.
    # The underlying assumption is that tenured people already know each other,
    # so this optimizes for meeting newer, less tenured people. Considered fine
    # for v1 as long as it does not lead to starvation; worth collecting feedback
    # on whether people notice not being matched with similarly tenured
    # colleagues, and eventually letting users state their matching preferences.
    dist_3 = abs(int(user_a_attributes["days_since_start"]) - int(user_b_attributes["days_since_start"]))
    # Guard the scaling: max_tenure is 0 when every user has 0 days of tenure.
    dist_3 = dist_3 / max_tenure if max_tenure else 0
    distance += dist_3

    # language
    lang_similarity = jaccard(user_a_attributes["languages"], user_b_attributes["languages"])
    dist_4 = 1 - lang_similarity
    distance += dist_4

    return distance


def get_meeting_weights(allowed_meetings):
    """
    generate a distance score for each user pair.

    allowed_meetings: collection of user-id pairs to score.
    Returns a dict mapping every pair in ``allowed_meetings`` to the value of
    ``get_pairwise_distance`` for that pair. An empty ``allowed_meetings``
    yields an empty dict without touching the database.
    """
    meeting_to_weight = {}

    # Nothing to score: skip the user query entirely. This also avoids building
    # an empty DataFrame (which has no "languages" column) when there are no users.
    if not allowed_meetings:
        return meeting_to_weight

    # need to convert this to JSON to match the previous logic
    db_query_result = db.session.query(User).all()
    json_dump = [obj.serialize() for obj in db_query_result]
    employees = pd.DataFrame(json_dump)

    # "languages" is stored as a ", "-separated string; a missing value becomes
    # an empty list instead of raising on .split().
    employees["languages"] = employees["languages"].apply(lambda x: x.split(", ") if isinstance(x, str) else [])
    employees = employees[["id", "manager_id", "days_since_start", "location", "languages", "email", "employee_id"]]
    # Self-join to resolve each user's manager_id (an employee_id) to the
    # manager's user id, exposed as the "id_manager" column.
    employees = employees.merge(
        employees[["employee_id", "id"]], how="left", left_on="manager_id", right_on="employee_id", suffixes=("", "_manager")
    )
    employees = employees.set_index("id", drop=False)
    max_tenure = max(employees["days_since_start"].astype(int))

    # yelp employee network graph created through reporting line
    G = nx.Graph()
    G.add_edges_from(list(zip(employees["id"], employees["id_manager"])))

    # get_pairwise_distance only reads from the frame, so the same DataFrame is
    # shared across all pairs rather than copied once per pair.
    for user_pair in allowed_meetings:
        users_distance_score = get_pairwise_distance(user_pair, org_graph=G, employee_df=employees, max_tenure=max_tenure)
        meeting_to_weight[user_pair] = users_distance_score

    return meeting_to_weight
12 changes: 6 additions & 6 deletions api/yelp_beans/matching/pair_match.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import networkx as nx

from yelp_beans.logic.user import user_preference
from yelp_beans.matching.match_utils import get_meeting_weights
from yelp_beans.matching.match_utils import get_previous_meetings


Expand Down Expand Up @@ -78,16 +79,15 @@ def construct_graph(user_ids, disallowed_meetings):
Yay graphs! Networkx will do all the work for us.
"""

# special weights that be put on the matching potential of each meeting,
# depending on heuristics for what makes a good/bad potential meeting.
meeting_to_weight = {}

# This creates the graph and the maximal matching set is returned.
# It does not return anyone who didn't get matched.
meetings = []
possible_meetings = {meeting for meeting in itertools.combinations(user_ids, 2)}
allowed_meetings = possible_meetings - disallowed_meetings
possible_meetings = {tuple(sorted(meeting)) for meeting in itertools.combinations(user_ids, 2)}
allowed_meetings = possible_meetings - {tuple(sorted(a)) for a in disallowed_meetings}

# special weights that can be put on the matching potential of each meeting,
# depending on heuristics for what makes a good/bad potential meeting.
meeting_to_weight = get_meeting_weights(allowed_meetings)
for meeting in allowed_meetings:
weight = meeting_to_weight.get(meeting, 1.0)
meetings.append((*meeting, {"weight": weight}))
Expand Down
23 changes: 23 additions & 0 deletions api/yelp_beans/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,32 @@ class User(db.Model):
terminated = db.Column(db.Boolean, nullable=False, default=False)
subscription_preferences = db.relationship("UserSubscriptionPreferences")

# Additional fields for match algo
# (read by get_meeting_weights / get_pairwise_distance in matching/match_utils.py)
# languages: ", "-separated text, e.g. "en, fr"; split into a list before scoring
languages = db.Column(db.Text)
# days_since_start: tenure in days; the absolute difference between two users is scored
days_since_start = db.Column(db.Integer)
# employee_id: string identifier that other users' manager_id values are joined against
employee_id = db.Column(db.String())
# location: two ", "-separated parts, e.g. "UK, London"; each part is compared separately
location = db.Column(db.String())
# manager_id: employee_id of this user's manager; used to build the reporting-line graph
manager_id = db.Column(db.String())

def get_username(self):
    """Return the part of ``self.email`` that precedes the first "@"."""
    local_part, _, _ = self.email.partition("@")
    return local_part

def serialize(self):
    """
    Return a plain dict of this user's attributes.

    Keys are the attribute names listed below, in that order; values are the
    current attribute values, returned as-is (no conversion or copying).
    """
    exported_fields = (
        "id",
        "email",
        "first_name",
        "last_name",
        "photo_url",
        "meta_data",
        "terminated",
        "languages",
        "days_since_start",
        "employee_id",
        "location",
        "manager_id",
    )
    return {field: getattr(self, field) for field in exported_fields}


class MeetingSubscription(db.Model):
"""The base template for a meeting type, it is comprised of
Expand Down