contentdb/app/tasks/phpbbparser.py

# Copyright (c) 2016  Andrew "rubenwardy" Ward
# License: MIT
# Source: https://github.com/rubenwardy/python_phpbb_parser

import urllib, socket
from bs4 import *
from urllib.parse import urljoin
from datetime import datetime
import urllib.request
import os.path
import time, re

def urlEncodeNonAscii(b):
	"""Percent-encode every non-ASCII character of *b* as its UTF-8 bytes.

	ASCII characters (including reserved ones such as ``&`` or spaces) are
	returned untouched, exactly as before. Each run of non-ASCII characters is
	replaced by the lowercase ``%xx`` escapes of its UTF-8 encoding.

	The previous implementation escaped the *code point* of characters in the
	U+0080..U+00FF range (producing Latin-1 style escapes such as ``%e9``) and
	left anything above U+00FF unescaped, which yields a URL that cannot be
	sent as ASCII.

	:param b: text to embed in a URL (a ``str``).
	:return: *b* with all non-ASCII characters percent-encoded.
	"""
	def _escape(match):
		# One %xx escape per UTF-8 byte of the matched non-ASCII run.
		return "".join("%%%02x" % byte for byte in match.group(0).encode("utf-8"))

	return re.sub(r"[^\x00-\x7F]+", _escape, b)

class Profile:
	"""Plain record describing one forum user.

	Attributes:
		username:   the name passed to the constructor.
		signature:  starts as an empty string.
		avatar:     starts as ``None``.
		properties: dict of arbitrary key/value pairs, managed through
		            :meth:`set` and :meth:`get`.
	"""

	def __init__(self, username):
		self.username   = username
		self.signature  = ""
		self.avatar     = None
		self.properties = {}

	def set(self, key, value):
		"""Store *value* under *key*, replacing any previous value."""
		self.properties[key] = value

	def get(self, key):
		"""Return the value stored under *key*, or ``None`` when absent."""
		return self.properties.get(key)

	def __str__(self):
		"""Return username, signature and properties on three lines."""
		parts = [self.username, str(self.signature), str(self.properties)]
		return "\n".join(parts)

def __extract_properties(profile, soup):
	"""Copy the avatar and the detail fields of a profile page into *profile*.

	The element with id ``viewprofile`` is searched for:

	* an avatar: when the first ``<dl>`` contains exactly one ``<img>``, its
	  ``src`` attribute is stored in ``profile.avatar``;
	* details: the ``<dt>``/``<dd>`` pairs of the single
	  ``<dl class="left-box details">`` are stored with ``profile.set``,
	  except for the ``groups`` entry which is skipped.

	Keys are the ``<dt>`` text, stripped of surrounding whitespace and of one
	trailing colon, then lowercased.

	:param profile: object to fill (needs ``avatar`` and ``set(key, value)``).
	:param soup: parsed profile page.
	:return: always ``None``; the function works by mutating *profile*.
	"""
	el = soup.find(id="viewprofile")
	if el is None:
		return None

	# Guard against a profile block without any <dl>: indexing an empty
	# result list used to raise IndexError.
	all_lists = el.find_all("dl")
	if all_lists:
		imgs = all_lists[0].find_all("img")
		if len(imgs) == 1:
			# .get() leaves the avatar as None for an <img> without src
			# instead of raising KeyError.
			profile.avatar = imgs[0].get("src")

	res = el.find_all("dl", class_ = "left-box details")
	if len(res) != 1:
		return None

	# Key taken from the last <dt>, waiting for its matching <dd>.
	catch_next_key = None

	for element in res[0].children:
		if element.name == "dt":
			if catch_next_key is None:
				# Trim whitespace first so the character removed is really
				# the label's trailing colon, and only if there is one.
				label = element.text.strip()
				if label.endswith(":"):
					label = label[:-1]
				catch_next_key = label.strip().lower()
			else:
				print("Unexpected dt!")

		elif element.name == "dd":
			if catch_next_key is None:
				print("Unexpected dd!")
			else:
				if catch_next_key != "groups":
					profile.set(catch_next_key, element.text)
				catch_next_key = None

		elif element and element.name is not None:
			# Any other tag inside the details list; text nodes have no name.
			print("Unexpected other")

def __extract_signature(soup):
	"""Return the page's only ``<div class="signature">`` element.

	:param soup: parsed page to search.
	:return: the matching element when exactly one exists, otherwise ``None``
	         (both for no match and for several matches).
	"""
	matches = soup.find_all("div", class_="signature")
	return matches[0] if len(matches) == 1 else None

def getProfile(url, username):
	"""Download and parse the phpBB profile page of *username*.

	:param url: base URL of the forum, without a trailing slash;
	            ``/memberlist.php?mode=viewprofile&un=<username>`` is appended.
	:param username: forum user name; non-ASCII characters are escaped with
	                 ``urlEncodeNonAscii``.
	:return: a ``Profile`` whose signature, avatar and properties were filled
	         from the page (fields stay at their defaults when not found).
	:raises: whatever ``urllib.request.urlopen`` raises on network/HTTP
	         errors, and ``UnicodeDecodeError`` if the page is not UTF-8.
	"""
	profile_url = url + "/memberlist.php?mode=viewprofile&un=" + urlEncodeNonAscii(username)

	# Close the HTTP response deterministically instead of leaking it.
	with urllib.request.urlopen(profile_url) as response:
		contents = response.read().decode("utf-8")

	# Constructing BeautifulSoup never yields None, so the former
	# ``if soup is None`` branch was unreachable and has been dropped.
	soup = BeautifulSoup(contents, "lxml")

	profile = Profile(username)
	profile.signature = __extract_signature(soup)
	__extract_properties(profile, soup)

	return profile


# Matches a whole href and captures, as group 1, the digits following a ``t=``
# in it (used below to pull the topic id out of a topic link).
regex_id = re.compile(r"^.*t=([0-9]+).*$")

def parseForumListPage(id, page, out, extra=None):
	"""Fetch one page of a forum's topic list and add its topics to *out*.

	Rows carrying the ``sticky``, ``announce`` or ``global-announce`` class
	are ignored. Every other row becomes a dict with the keys ``id``,
	``title``, ``author``, ``posts``, ``views`` and ``date`` (plus the items of
	*extra*), stored in *out* under the topic id.

	``id`` values are the strings captured by ``regex_id``; ``posts`` and
	``views`` are the text found in the page, not converted to numbers;
	``date`` is a naive ``datetime``.

	:param id: forum id, inserted into the ``f=`` query parameter.
	:param page: zero-based page number (30 topics per page).
	:param out: dict mapping topic id to topic dict; mutated in place.
	:param extra: optional mapping merged into every topic dict.
	:return: ``True`` when the page contributed at least one topic and none of
	         them was already in *out*; ``False`` as soon as an already known
	         topic is met, or when the page has no ordinary topic at all.
	         The latter stops callers that loop "while True" from paging
	         forever on an empty forum.
	"""
	num_per_page = 30
	start = page*num_per_page+1
	print(" - Fetching page {} (topics {}-{})".format(page, start, start+num_per_page))

	url = "https://forum.minetest.net/viewforum.php?f=" + str(id) + "&start=" + str(start)
	# Close the HTTP response deterministically instead of leaking it.
	with urllib.request.urlopen(url) as response:
		html = response.read().decode("utf-8")
	soup = BeautifulSoup(html, "html.parser")

	found_topic = False

	for row in soup.find_all("li", class_="row"):
		classes = row.get("class")
		if "sticky" in classes or "announce" in classes or "global-announce" in classes:
			continue

		topic = row.find("dl")

		# Link info. A separate name is used so the forum id parameter is
		# not overwritten by the topic id.
		link     = topic.find(class_="topictitle")
		topic_id = regex_id.match(link.get("href")).group(1)
		title    = link.find(text=True)

		# Date: the text after the "»" separator in the <dt> cell.
		left      = topic.find("dt")
		date_text = left.get_text().split("»")[1].strip()
		date      = datetime.strptime(date_text, "%a %b %d, %Y %H:%M")
		# The author is the last link of the <dt> cell.
		author    = left.find_all("a")[-1].get_text().strip()

		# Get counts
		posts = topic.find(class_="posts").find(text=True)
		views = topic.find(class_="views").find(text=True)

		if topic_id in out:
			# A topic seen before means the listing has wrapped around.
			print("   - got {} again, title: {}".format(topic_id, title))
			assert title == out[topic_id]['title']
			return False

		entry = {
			"id"    : topic_id,
			"title" : title,
			"author": author,
			"posts" : posts,
			"views" : views,
			"date"  : date
		}

		if extra is not None:
			entry.update(extra)

		out[topic_id] = entry
		found_topic = True

	return found_topic

def getTopicsFromForum(id, out=None, extra=None):
	"""Collect the topics of forum *id*, page by page.

	Pages are requested with ``parseForumListPage`` starting at page 0 until
	it returns ``False``.

	:param id: forum id.
	:param out: optional dict to add the topics to. When omitted a fresh dict
	            is created for this call (the former ``out={}`` default was a
	            single dict shared by every call, so a second call stopped on
	            its first topic).
	:param extra: optional mapping passed through to ``parseForumListPage``.
	:return: the dict holding the topics (*out* itself when it was given).
	"""
	if out is None:
		out = {}

	print("Fetching all topics from forum {}".format(id))
	page = 0
	while parseForumListPage(id, page, out, extra):
		page += 1

	return out

def dumpTitlesToFile(topics, path):
	"""Write the title of every topic to *path*, one per line.

	:param topics: mapping whose values are dicts with a ``"title"`` string.
	:param path: file to create or overwrite.

	The file is written as UTF-8 regardless of the locale, so titles with
	non-ASCII characters cannot fail with ``UnicodeEncodeError`` on systems
	whose default encoding is ASCII.
	"""
	with open(path, "w", encoding="utf-8") as out_file:
		for topic in topics.values():
			out_file.write(topic["title"] + "\n")
Add license to all JS/py files 2018-05-17 16:18:20 +02:00			`# Copyright (c) 2016 Andrew "rubenwardy" Ward`
			`# License: MIT`
			`# Source: https://github.com/rubenwardy/python_phpbb_parser`

Add user account claiming 2018-05-14 00:31:42 +02:00			`import urllib, socket`
			`from bs4 import *`
			`from urllib.parse import urljoin`
Implement forum parser to increase accuracy 2018-07-04 01:14:37 +02:00			`from datetime import datetime`
Add user account claiming 2018-05-14 00:31:42 +02:00			`import urllib.request`
			`import os.path`
Add import users from Krock's mod list feature 2018-05-15 16:00:12 +02:00			`import time, re`

			`def urlEncodeNonAscii(b):`
			`return re.sub('[\x80-\xFF]', lambda c: '%%%02x' % ord(c.group(0)), b)`
Add user account claiming 2018-05-14 00:31:42 +02:00
			`class Profile:`
			`def __init__(self, username):`
Add support for using forum profile pictures 2018-12-25 20:28:32 +01:00			`self.username = username`
			`self.signature = ""`
			`self.avatar = None`
Add user account claiming 2018-05-14 00:31:42 +02:00			`self.properties = {}`

			`def set(self, key, value):`
			`self.properties[key] = value`

			`def get(self, key):`
			`return self.properties[key] if key in self.properties else None`

			`def __str__(self):`
			`return self.username + "\n" + str(self.signature) + "\n" + str(self.properties)`

			`def __extract_properties(profile, soup):`
			`el = soup.find(id="viewprofile")`
			`if el is None:`
			`return None`

Add support for using forum profile pictures 2018-12-25 20:28:32 +01:00			`res1 = el.find_all("dl")`
			`imgs = res1[0].find_all("img")`
			`if len(imgs) == 1:`
			`profile.avatar = imgs[0]["src"]`

Add user account claiming 2018-05-14 00:31:42 +02:00			`res = el.find_all("dl", class_ = "left-box details")`
			`if len(res) != 1:`
			`return None`

			`catch_next_key = None`

			`# Look through`
			`for element in res[0].children:`
			`if element.name == "dt":`
			`if catch_next_key is None:`
			`catch_next_key = element.text.lower()[:-1].strip()`
			`else:`
			`print("Unexpected dt!")`

			`elif element.name == "dd":`
			`if catch_next_key is None:`
			`print("Unexpected dd!")`
			`else:`
			`if catch_next_key != "groups":`
			`profile.set(catch_next_key, element.text)`
			`catch_next_key = None`

			`elif element and element.name is not None:`
			`print("Unexpected other")`

			`def __extract_signature(soup):`
			`res = soup.find_all("div", class_="signature")`
			`if (len(res) != 1):`
			`return None`
			`else:`
			`return res[0]`

			`def getProfile(url, username):`
Add import users from Krock's mod list feature 2018-05-15 16:00:12 +02:00			`url = url + "/memberlist.php?mode=viewprofile&un=" + urlEncodeNonAscii(username)`
Add user account claiming 2018-05-14 00:31:42 +02:00
			`contents = urllib.request.urlopen(url).read().decode("utf-8")`
			`soup = BeautifulSoup(contents, "lxml")`
			`if soup is None:`
			`return None`
			`else:`
			`profile = Profile(username)`
			`profile.signature = __extract_signature(soup)`
			`__extract_properties(profile, soup)`

			`return profile`
Implement forum parser to increase accuracy 2018-07-04 01:14:37 +02:00

			`regex_id = re.compile(r"^.*t=([0-9]+).*$")`

			`def parseForumListPage(id, page, out, extra=None):`
			`num_per_page = 30`
			`start = page*num_per_page+1`
			`print(" - Fetching page {} (topics {}-{})".format(page, start, start+num_per_page))`

			`url = "https://forum.minetest.net/viewforum.php?f=" + str(id) + "&start=" + str(start)`
			`r = urllib.request.urlopen(url).read().decode("utf-8")`
			`soup = BeautifulSoup(r, "html.parser")`

			`for row in soup.find_all("li", class_="row"):`
			`classes = row.get("class")`
			`if "sticky" in classes or "announce" in classes or "global-announce" in classes:`
			`continue`

			`topic = row.find("dl")`

			`# Link info`
			`link = topic.find(class_="topictitle")`
			`id = regex_id.match(link.get("href")).group(1)`
			`title = link.find(text=True)`

			`# Date`
			`left = topic.find("dt")`
			`date = left.get_text().split("»")[1].strip()`
			`date = datetime.strptime(date, "%a %b %d, %Y %H:%M")`
			`author = left.find_all("a")[-1].get_text().strip()`

			`# Get counts`
			`posts = topic.find(class_="posts").find(text=True)`
			`views = topic.find(class_="views").find(text=True)`

			`if id in out:`
			`print(" - got {} again, title: {}".format(id, title))`
Add reloading support to Docker container 2020-01-18 02:38:00 +01:00			`assert title == out[id]['title']`
Implement forum parser to increase accuracy 2018-07-04 01:14:37 +02:00			`return False`

			`row = {`
			`"id" : id,`
			`"title" : title,`
			`"author": author,`
			`"posts" : posts,`
			`"views" : views,`
			`"date" : date`
			`}`

			`if extra is not None:`
			`for key, value in extra.items():`
			`row[key] = value`

			`out[id] = row`

Fix accidental regression in phpbbparser 2018-08-25 22:24:59 +02:00			`return True`
Implement forum parser to increase accuracy 2018-07-04 01:14:37 +02:00
			`def getTopicsFromForum(id, out={}, extra=None):`
			`print("Fetching all topics from forum {}".format(id))`
			`page = 0`
			`while parseForumListPage(id, page, out, extra):`
			`page = page + 1`

			`return out`

			`def dumpTitlesToFile(topics, path):`
			`with open(path, "w") as out_file:`
			`for topic in topics.values():`
			`out_file.write(topic["title"] + "\n")`