You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

120 lines
4.0 KiB

3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
  1. import json
  2. import re
  3. import pathlib
  4. from warnings import warn
  5. import numpy as np
  6. import pandas as pd
  7. from tqdm import tqdm
  8. from moodle_dl.moodle_connector.request_helper import RequestHelper
  9. from moodle_dl.moodle_connector.first_contact_handler import FirstContactHandler
  10. from moodle_dl.moodle_connector.sso_token_receiver import extract_token
  11. def get_cds(c):
  12. regex = '^B\d+ \((B\d+)\) -.*'
  13. m = re.match(regex, c.fullname)
  14. if m:
  15. return m.groups(0)[0]
  16. def normalize_name(email, namestring):
  17. mailuser, domain = email.split('@')
  18. allnames = namestring.lower().split(' ')
  19. if domain == 'stud.unifi.it':
  20. firstname, lastname = mailuser.split('.')
  21. firstname = " ".join([ n.capitalize() for n in allnames if n in firstname ])
  22. lastname = " ".join([ n.capitalize() for n in allnames if n in lastname ])
  23. return firstname, lastname
  24. if len(allnames) == 2:
  25. firstname = namestring[1].capitalize()
  26. lastname = namestring[0].capitalize()
  27. return firstname, lastname
  28. return "", " ".join([a.capitalize() for a in allnames])
  29. def is_student(u):
  30. roleids = [r['roleid'] for r in u['roles']]
  31. return (5 in roleids and not u['email'].endswith('@unifi.it'))
  32. class MailListDownloader():
  33. def __init__(self, url):
  34. self.token, self.secret_token = extract_token(url)
  35. self.rh = RequestHelper('e-l.unifi.it', token=self.token)
  36. self.fch = FirstContactHandler(self.rh)
  37. def course_emails(self, courseid):
  38. users_raw = self.rh.post_REST('core_enrol_get_enrolled_users', dict(courseid=courseid))
  39. users = list()
  40. skipped = 0
  41. for u in users_raw:
  42. if not 'fullname' in u:
  43. print(f'Malformed record u={u}')
  44. continue
  45. if not 'email' in u:
  46. skipped += 1
  47. #print(f"Skipping {u['fullname']}, no available email.")
  48. continue
  49. if not is_student(u):
  50. skipped += 1
  51. #print(f"Skipping {u['fullname']}, not a student.")
  52. continue
  53. firstname, lastname = normalize_name(u['email'], u['fullname'])
  54. users.append(dict(firstname=firstname, lastname=lastname, email=u['email']))
  55. print(f"\t{len(users)} found, {skipped} skipped.")
  56. return users
  57. def course_list(self):
  58. userid, version = self.fch.fetch_userid_and_version()
  59. courses = self.fch.fetch_courses(userid)
  60. return [ dict(cid=c.id,cds=get_cds(c),fullname=c.fullname) for c in courses ]
  61. def download_from_new_courses(self, path):
  62. path = pathlib.Path(path)
  63. downloaded_file = path.with_suffix('.downloaded.json')
  64. database_file = path.with_suffix('.db.hdf')
  65. if downloaded_file.is_file():
  66. with downloaded_file.open() as f:
  67. downloaded = json.load(f)
  68. else:
  69. downloaded = []
  70. if database_file.is_file():
  71. db = pd.read_hdf(database_file, 'emails')
  72. else:
  73. db = pd.DataFrame()
  74. db['cds'] = []
  75. db['firstname'] = []
  76. db['lastname'] = []
  77. db['email'] = []
  78. for c in tqdm(self.course_list()):
  79. old_len = len(db)
  80. if c['cds'] and not c['cid'] in downloaded:
  81. print(f"Downloading {c['fullname']}")
  82. cid = c['cid']
  83. cds = c['cds']
  84. emails = self.course_emails(cid)
  85. emails = pd.DataFrame(emails)
  86. emails['cds'] = cds
  87. db = pd.concat([db, emails]).drop_duplicates().reset_index(drop=True)
  88. downloaded.append(cid)
  89. new_len = len(db)
  90. new = new_len - old_len
  91. duplicates = len(emails) - new
  92. print(f"\t{new_len-old_len} new, {duplicates} duplicates.")
  93. with downloaded_file.open('w') as f:
  94. json.dump(downloaded, f)
  95. db.to_hdf(database_file, key='emails')
  96. return db