Erster Checkin: Tool arbeitet
This commit is contained in:
174
app/extractor.py
Normal file
174
app/extractor.py
Normal file
@@ -0,0 +1,174 @@
|
||||
from __future__ import annotations

import asyncio
import csv
import logging
from pathlib import Path
from typing import Any, Dict, List

from playwright.async_api import async_playwright, Page, Response

from .auth import AUTH_STATE_FILE
from .models import Person, WorkingWithRelation
from .parser import (
    parse_person_from_organization_person,
    parse_direct_emails_from_organization,
    parse_workingwith_entries,
)
|
||||
|
||||
|
||||
# URL prefixes of the Delve ("loki") API endpoints. Intercepted response URLs
# are compared against these with ``str.startswith``.
_DELVE_API_ROOT = "https://eur.loki.delve.office.com/api"
DELV_PERSON_URL = f"{_DELVE_API_ROOT}/v2/person"
DELV_ORG_URL = f"{_DELVE_API_ROOT}/v1/organization"
DELV_WORKINGWITH_URL = f"{_DELVE_API_ROOT}/v1/workingwith"
|
||||
|
||||
|
||||
async def _collect_json_from_responses(page: Page) -> Dict[str, Any]:
    """Start capturing Delve API JSON payloads from the responses of ``page``.

    A ``"response"`` listener is attached to ``page``. Whenever a response URL
    starts with one of the ``DELV_*_URL`` prefixes, its JSON body is stored in
    the returned dict under ``"person"``, ``"organization"`` or
    ``"workingwith"``. A later matching response overwrites an earlier one.

    The dict is returned right away (initially empty) and is filled as
    responses arrive, so the caller must drive the page and wait before
    reading it.

    NOTE: the listener is never detached here, so every call adds one more
    listener to the same page.

    Args:
        page: Playwright page whose network responses are observed.

    Returns:
        The live dict that the listener populates.
    """
    captured: Dict[str, Any] = {}
    # (result key, URL prefix) pairs, checked in order; the first match wins.
    endpoints = (
        ("person", DELV_PERSON_URL),
        ("organization", DELV_ORG_URL),
        ("workingwith", DELV_WORKINGWITH_URL),
    )

    async def handle_response(response: Response) -> None:
        url = response.url
        for key, prefix in endpoints:
            if not url.startswith(prefix):
                continue
            try:
                captured[key] = await response.json()
            except Exception:
                # Capturing stays best-effort (a body that cannot be read or
                # parsed must not abort the run), but leave a trace instead
                # of discarding the failure silently.
                logging.getLogger(__name__).debug(
                    "Could not read JSON body of %s", url, exc_info=True
                )
            return

    page.on("response", handle_response)
    return captured
|
||||
|
||||
|
||||
async def _open_profile_and_collect(
    page: Page,
    email: str,
    *,
    results_wait_ms: int = 2000,
    capture_wait_ms: int = 4000,
) -> Dict[str, Any]:
    """Search for ``email`` in the M365 search UI and capture the API payloads.

    Steps: start response capturing on ``page``, open the search page, submit
    the query ``Person:"<email>"``, click the ``Organisation`` tab button and
    then wait a fixed time so that the resulting network calls can be
    recorded.

    Args:
        page: Playwright page to drive; it is navigated away from whatever it
            currently shows.
        email: Address inserted verbatim into the ``Person:"..."`` query.
        results_wait_ms: Pause in milliseconds after submitting the query.
        capture_wait_ms: Pause in milliseconds after opening the
            ``Organisation`` tab, before the captured data is returned.

    Returns:
        The dict produced by ``_collect_json_from_responses``; which keys are
        present depends on which responses arrived during the waits.
    """
    collected = await _collect_json_from_responses(page)

    await page.goto("https://m365.cloud.microsoft/search/?auth=2")

    # Open the search box.
    await page.get_by_role("button", name="search").click()

    search_input = page.locator('input[type="text"]')
    await search_input.fill(f'Person:"{email}"')
    await search_input.press("Enter")

    # Fixed pause after submitting the query, before touching the result.
    await page.wait_for_timeout(results_wait_ms)

    # Switch to the "Organisation" tab; it is clicked directly, without an
    # intermediate click on the profile entry.
    await page.locator('button[data-content="Organisation"]').click()

    # Give the page time to issue the network calls we want to capture.
    await page.wait_for_timeout(capture_wait_ms)
    return collected
|
||||
|
||||
|
||||
async def extract_relations_for_manager(manager_email: str) -> List[WorkingWithRelation]:
    """Collect working-with relations for a manager and for each of their directs.

    The manager's profile is opened first. Every parsed working-with entry
    becomes one relation with the manager as source. Then, for each address
    returned by ``parse_direct_emails_from_organization`` (presumably the
    manager's direct reports -- confirm against the parser), that profile is
    opened as well and its working-with entries are added with that person as
    source.

    The browser is a non-headless Chrome persistent context using the
    ``user_data`` profile directory. ``AUTH_STATE_FILE`` is only checked for
    existence here; this function does not load it.

    Args:
        manager_email: Address of the manager to start from.

    Returns:
        All relations, the manager's first, then those of each direct in the
        order the parser returned them.

    Raises:
        RuntimeError: If ``AUTH_STATE_FILE`` does not exist.
    """
    if not AUTH_STATE_FILE.exists():
        raise RuntimeError("auth_state.json nicht vorhanden, bitte zuerst login-Modus ausführen.")

    log = logging.getLogger(__name__)
    relations: List[WorkingWithRelation] = []

    async with async_playwright() as p:
        context = await p.chromium.launch_persistent_context(
            user_data_dir="user_data",
            headless=False,
            channel="chrome",
        )
        # Close the browser context even if a navigation or click fails.
        try:
            page = await context.new_page()

            collected = await _open_profile_and_collect(page, manager_email)
            # A missing (or empty) payload is handed to the parsers as {}.
            manager_person = parse_person_from_organization_person(
                collected.get("person") or {}
            )
            log.debug("manager: %s", manager_person)
            working_with_persons = parse_workingwith_entries(
                collected.get("workingwith") or {}
            )
            log.debug("manager working-with: %s", working_with_persons)
            directs_emails = parse_direct_emails_from_organization(
                collected.get("organization") or {}
            )
            log.debug("directs: %s", directs_emails)

            # Manager -> working-with
            relations.extend(
                WorkingWithRelation(source=manager_person, destination=dest)
                for dest in working_with_persons
            )

            # Fetch the working-with list of every direct as well.
            for direct_email in directs_emails:
                collected_direct = await _open_profile_and_collect(page, direct_email)
                direct_person = parse_person_from_organization_person(
                    collected_direct.get("person") or {}
                )
                working_with_direct = parse_workingwith_entries(
                    collected_direct.get("workingwith") or {}
                )
                relations.extend(
                    WorkingWithRelation(source=direct_person, destination=dest)
                    for dest in working_with_direct
                )
        finally:
            await context.close()

    return relations
|
||||
|
||||
|
||||
async def extract_relations_for_emails(emails: List[str]) -> List[WorkingWithRelation]:
    """Collect working-with relations for an explicit list of addresses.

    For each address, in the given order, the profile is opened in the
    browser, the captured ``person`` and ``workingwith`` payloads are parsed,
    and one relation per working-with entry is added with that person as
    source.

    The browser is a non-headless Chrome persistent context using the
    ``user_data`` profile directory. ``AUTH_STATE_FILE`` is only checked for
    existence here; this function does not load it.

    Args:
        emails: Addresses whose profiles should be visited.

    Returns:
        All collected relations; empty if ``emails`` is empty.

    Raises:
        RuntimeError: If ``AUTH_STATE_FILE`` does not exist.
    """
    if not AUTH_STATE_FILE.exists():
        raise RuntimeError("auth_state.json nicht vorhanden – bitte zuerst login-Modus ausführen.")

    relations: List[WorkingWithRelation] = []

    async with async_playwright() as p:
        context = await p.chromium.launch_persistent_context(
            user_data_dir="user_data",
            headless=False,
            channel="chrome",
        )
        # Close the browser context even if a navigation or click fails.
        try:
            page = await context.new_page()

            for email in emails:
                collected = await _open_profile_and_collect(page, email)
                # A missing (or empty) payload is handed to the parsers as {}.
                person = parse_person_from_organization_person(
                    collected.get("person") or {}
                )
                working_with_persons = parse_workingwith_entries(
                    collected.get("workingwith") or {}
                )
                relations.extend(
                    WorkingWithRelation(source=person, destination=dest)
                    for dest in working_with_persons
                )
        finally:
            await context.close()

    return relations
|
||||
|
||||
|
||||
def write_relations_to_csv(relations: List[WorkingWithRelation], output_path: Path) -> None:
    """Write ``relations`` to ``output_path`` as a UTF-8 CSV file with header.

    Each relation becomes one row holding the ``email``, ``display_name``,
    ``job_title`` and ``department`` attributes of its ``source`` and of its
    ``destination``. An existing file is overwritten; missing parent
    directories are created. With no relations only the header is written.

    Args:
        relations: Relations to export, written in the given order.
        output_path: Target file path.
    """
    fieldnames = [
        "source_mail",
        "source_displayname",
        "source_jobTitle",
        "source_department",
        "destination_mail",
        "destination_displayname",
        "destination_jobTitle",
        "destination_department",
    ]

    # Avoid a FileNotFoundError when the target directory does not exist yet.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # newline="" lets the csv module control line endings itself.
    with output_path.open("w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(
            {
                "source_mail": rel.source.email,
                "source_displayname": rel.source.display_name,
                "source_jobTitle": rel.source.job_title,
                "source_department": rel.source.department,
                "destination_mail": rel.destination.email,
                "destination_displayname": rel.destination.display_name,
                "destination_jobTitle": rel.destination.job_title,
                "destination_department": rel.destination.department,
            }
            for rel in relations
        )
|
||||
Reference in New Issue
Block a user