Re: How to properly retrieve data using requests + bs4 from multiple pages in a site?

2016-12-03 Thread Juan C.
On Thu, Dec 1, 2016 at 10:07 PM, Juan C.  wrote:
> It works, but it has a big issue: it gets all data from all
units/courses/assignments at the same time, and this isn't very useful as I
don't care about data from units from 1-2 years ago. How can I change the
logic so it just gets the data I need at a given moment? For example, I may
need to dump data for an entire unit, or just one course, or maybe even
just one assignment. How can I achieve this behavior? Another "issue", I
feel like handing my 'session' that I instantiated at user.py to program,
then unit, then course and then assignment is a poor design, how can I make
it better?
>
> Any other suggestions are welcome.

Oh, I forgot to mention: I'm using Python 3.5.2 x64.
-- 
https://mail.python.org/mailman/listinfo/python-list


How to properly retrieve data using requests + bs4 from multiple pages in a site?

2016-12-01 Thread Juan C.
I'm a student and my university uses Moodle as their learning management
system (LMS). They don't have Moodle Web Services enabled and won't be
enabling it anytime soon, at least for students. The university programs
have the following structure, for example:

1. Bachelor's Degree in Computer Science (duration: 8 semesters)

1.1. Unit 01: Mathematics Fundamental (duration: 1 semester)
1.1.1. Algebra I (first 3 months)
1.1.2. Algebra II (first 3 months)
1.1.3. Calculus I (last 3 months)
1.1.4. Calculus II (last 3 months)
1.1.5. Unit Project (throughout the semester)

1.2. Unit 02: Programming (duration: 1 semester)
1.2.1. Programming Logic (first 3 months)
1.2.2. Data Modelling with UML (first 3 months)
1.2.3. Python I (last 3 months)
1.2.4. Python II (last 3 months)
1.2.5. Unit Project (throughout the semester)

Each course/project has a bunch of assignments plus one final assignment.
This goes on for a total of 8 (eight) units, which make up a 4-year
program. I'm building my own client-side Moodle API to be consumed by my
scripts. Currently I'm using 'requests' + 'bs4' to do the job. My code:

package moodle/

user.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from .program import Program
import requests


class User:
    """A Moodle user whose logged-in session is used to load a program.

    Instances expose ``username``, ``password`` and ``program`` (a
    ``Program`` built with the session created in ``__init__``).
    Two users compare equal when their usernames match.
    """

    _AUTH_URL = 'http://lms.university.edu/moodle/login/index.php'

    def __init__(self, username, password, program_id):
        """Post the credentials to ``_AUTH_URL`` and build the program.

        :param username: login name sent in the form field ``username``.
        :param password: password sent in the form field ``password``.
        :param program_id: identifier forwarded to ``Program``.
        """
        self.username = username
        self.password = password
        session = requests.session()
        # The login response is not checked here; the same session object
        # is simply handed on to Program.
        session.post(self._AUTH_URL,
                     {"username": username, "password": password})
        self.program = Program(program_id=program_id, session=session)

    def __str__(self):
        """Return ``username:password``."""
        # NOTE(review): this exposes the password in plain text whenever the
        # object is printed or logged. Kept for backward compatibility.
        return self.username + ':' + self.password

    def __repr__(self):
        """Return a short debug representation containing the username."""
        # The one-element tuple keeps %-formatting safe for any value.
        return '<User: %s>' % (self.username,)

    def __eq__(self, other):
        """Compare by username; other types are never equal."""
        # isinstance() needs a class as its second argument, not an instance.
        if isinstance(other, User):
            return self.username == other.username
        return NotImplemented

    def __hash__(self):
        """Hash on the same key that ``__eq__`` compares."""
        return hash(self.username)

==

program.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from .unit import Unit
from bs4 import BeautifulSoup


class Program:
    """A degree program, scraped from a Moodle category page.

    Instances expose ``name``, ``id`` and ``units`` (a list of ``Unit``).
    Two programs compare equal when their ids match.
    """

    _PATH = 'http://lms.university.edu/moodle/course/index.php?categoryid='

    def __init__(self, program_id, session):
        """Fetch the category page for *program_id* and build its units.

        :param program_id: category id appended to ``_PATH``.
        :param session: object whose ``get(url)`` returns a response with a
            ``text`` attribute (the caller passes a ``requests`` session).
        """
        response = session.get(self._PATH + str(program_id))
        soup = BeautifulSoup(response.text, 'html.parser')

        # The name is read from the second-to-last breadcrumb entry, with
        # '/' separators removed and surrounding whitespace stripped.
        crumbs = soup.find('ul', class_='breadcrumb').find_all('li')
        self.name = crumbs[-2].text.replace('/', '').strip()
        self.id = program_id
        # Every <div class="category"> becomes one Unit, which is constructed
        # immediately using the same session.
        self.units = [
            Unit(int(item['data-categoryid']), session)
            for item in soup.find_all('div', {'class': 'category'})
        ]

    def __str__(self):
        """Return the program name."""
        return self.name

    def __repr__(self):
        """Return a short debug representation with name and id."""
        return '<Program: %s (%s)>' % (self.name, self.id)

    def __eq__(self, other):
        """Compare by id; other types are never equal."""
        # isinstance() needs a class as its second argument, not an instance.
        if isinstance(other, Program):
            return self.id == other.id
        return NotImplemented

    def __hash__(self):
        """Hash on the same key that ``__eq__`` compares."""
        return hash(self.id)

==

unit.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from .course import Course
from bs4 import BeautifulSoup


class Unit:
    """A unit of a program, scraped from a Moodle category page.

    Instances expose ``name``, ``id`` and ``courses`` (a list of
    ``Course``). Two units compare equal when their ids match.
    """

    _PATH = 'http://lms.university.edu/moodle/course/index.php?categoryid='

    def __init__(self, unit_id, session):
        """Fetch the category page for *unit_id* and build its courses.

        :param unit_id: category id appended to ``_PATH``.
        :param session: object whose ``get(url)`` returns a response with a
            ``text`` attribute (the caller passes a ``requests`` session).
        """
        response = session.get(self._PATH + str(unit_id))
        soup = BeautifulSoup(response.text, 'html.parser')

        # The name is read from the last breadcrumb entry, with '/'
        # separators removed and surrounding whitespace stripped.
        crumbs = soup.find('ul', class_='breadcrumb').find_all('li')
        self.name = crumbs[-1].text.replace('/', '').strip()
        self.id = unit_id
        # Every <div class="coursebox"> becomes one Course, which is
        # constructed immediately using the same session.
        self.courses = [
            Course(int(item['data-courseid']), session)
            for item in soup.find_all('div', {'class': 'coursebox'})
        ]

    def __str__(self):
        """Return the unit name."""
        return self.name

    def __repr__(self):
        """Return a short debug representation with name and id."""
        return '<Unit: %s (%s)>' % (self.name, self.id)

    def __eq__(self, other):
        """Compare by id; other types are never equal."""
        # isinstance() needs a class as its second argument, not an instance.
        if isinstance(other, Unit):
            return self.id == other.id
        return NotImplemented

    def __hash__(self):
        """Hash on the same key that ``__eq__`` compares."""
        return hash(self.id)

==

course.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-


from .assignment import Assignment
import re
from bs4 import BeautifulSoup


class Course:
    """A course of a unit, scraped from a Moodle course page.

    Instances expose ``name``, ``id`` and ``assignments`` (a list of
    ``Assignment``). Two courses compare equal when their ids match.
    """

    _PATH = 'http://lms.university.edu/moodle/course/view.php?id='

    # Matches links to assignment pages. Written as a single raw string so
    # that no line break ends up inside the pattern.
    _ASSIGNMENT_HREF = re.compile(
        r'http://lms\.university\.edu/moodle/mod/assign/view.php\?id=.*')

    def __init__(self, course_id, session):
        """Fetch the course page for *course_id* and build its assignments.

        :param course_id: course id appended to ``_PATH``.
        :param session: object whose ``get(url)`` returns a response with a
            ``text`` attribute (the caller passes a ``requests`` session).
        """
        response = session.get(self._PATH + str(course_id))
        soup = BeautifulSoup(response.text, 'html.parser')

        # The name is the text of the first <h1> on the page.
        self.name = soup.find('h1').text
        self.id = course_id
        # The assignment id is whatever follows the last 'id=' in each
        # matching link; every Assignment is constructed immediately.
        links = soup.find_all('a', href=self._ASSIGNMENT_HREF)
        self.assignments = [
            Assignment(int(item['href'].split('id=')[-1]), session)
            for item in links
        ]

    def __str__(self):
        """Return the course name."""
        return self.name

    def __repr__(self):
        """Return a short debug representation with name and id."""
        return '<Course: %s (%s)>' % (self.name, self.id)

    def __eq__(self, other):
        """Compare by id; other types are never equal."""
        # isinstance() needs a class as its second argument, not an instance.
        if isinstance(other, Course):
            return self.id == other.id
        return NotImplemented

    def __hash__(self):
        """Hash on the same key that ``__eq__`` compares."""
        return hash(self.id)

==

assignment.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup


class Assignment:
   _PATH = 'http://lms.university.edu/moodle/mod/assign/view.php?id='

   def __init__(self, assignment_id, session):
  response = session.get(self._PATH + str(assignment_id))
  soup = BeautifulSoup(response.text, 'html.parser')

  self.name = soup.find('h2').text
  self.id = assignment_id
  self.sent = soup.find('td',