From 19e1ed8b32179e3317c807b3ab0581e3b5fb00a2 Mon Sep 17 00:00:00 2001
From: rubenwardy
Date: Wed, 4 Jul 2018 00:14:37 +0100
Subject: [PATCH] Implement forum parser to increase accuracy

---
 app/models.py                        |  16 ++--
 app/tasks/forumtasks.py              | 127 +++++++++++++++------
 app/tasks/phpbbparser.py             |  70 +++++++++++++++
 app/templates/admin/list.html        |   3 +-
 app/templates/macros/topictable.html |   4 +-
 app/templates/packages/view.html     |   2 +-
 app/views/admin.py                   |   9 +-
 app/views/packages/__init__.py       |   8 +-
 app/views/packages/todo.py           |  12 +--
 app/views/users.py                   |   6 +-
 migrations/versions/9fc23495713b_.py |  55 ++++++++++++
 11 files changed, 226 insertions(+), 86 deletions(-)
 create mode 100644 migrations/versions/9fc23495713b_.py

diff --git a/app/models.py b/app/models.py
index 5332dbd..b5b4836 100644
--- a/app/models.py
+++ b/app/models.py
@@ -743,23 +743,25 @@ REPO_BLACKLIST = [".zip", "mediafire.com", "dropbox.com", "weebly.com", \
 	"digitalaudioconcepts.com", "hg.intevation.org", "www.wtfpl.net", \
 	"imageshack.com", "imgur.com"]
 
-class KrockForumTopic(db.Model):
+class ForumTopic(db.Model):
 	topic_id = db.Column(db.Integer, primary_key=True, autoincrement=False)
 	author_id = db.Column(db.Integer, db.ForeignKey("user.id"), nullable=False)
 	author = db.relationship("User")
 
-	ttype = db.Column(db.Integer, nullable=False)
+	type = db.Column(db.Enum(PackageType), nullable=False)
 	title = db.Column(db.String(200), nullable=False)
 	name = db.Column(db.String(30), nullable=True)
 	link = db.Column(db.String(200), nullable=True)
 
-	def getType(self):
-		if self.ttype == 1 or self.ttype == 2:
-			return PackageType.MOD
-		elif self.ttype == 6:
-			return PackageType.GAME
+	posts = db.Column(db.Integer, nullable=False)
+	views = db.Column(db.Integer, nullable=False)
+
+	created_at = db.Column(db.DateTime, nullable=False, default=datetime.utcnow)
 
 	def getRepoURL(self):
+		if self.link is None:
+			return None
+
 		for item in REPO_BLACKLIST:
 			if item in self.link:
 				return None
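The models.py hunk above replaces the integer ttype column (raw forum type codes) with a real PackageType enum, adds posts, views and created_at columns, and makes getRepoURL() tolerate topics that have no link. For readers following the schema change, the value mapping that the removed getType() method encoded is reproduced below as a standalone sketch; the helper name and import are illustrative only and are not part of the patch:

from app.models import PackageType  # enum referenced by the new column above

def ttype_to_package_type(ttype):
	# Mapping taken from the removed KrockForumTopic.getType():
	# forum type 1 or 2 is a mod topic, 6 is a game topic, anything else is unknown.
	if ttype == 1 or ttype == 2:
		return PackageType.MOD
	elif ttype == 6:
		return PackageType.GAME
	return None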
diff --git a/app/tasks/forumtasks.py b/app/tasks/forumtasks.py
index b2e0ca8..5513fb2 100644
--- a/app/tasks/forumtasks.py
+++ b/app/tasks/forumtasks.py
@@ -15,12 +15,12 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 
-import flask, json
+import flask, json, re
 from flask.ext.sqlalchemy import SQLAlchemy
 from app import app
 from app.models import *
 from app.tasks import celery
-from .phpbbparser import getProfile
+from .phpbbparser import getProfile, getTopicsFromForum
 import urllib.request
 from urllib.parse import urlparse, quote_plus
 
@@ -51,71 +51,88 @@ def checkForumAccount(username, token=None):
 	if needsSaving:
 		db.session.commit()
 
-@celery.task()
-def importUsersFromModList():
+
+regex_tag = re.compile(r"\[([a-z0-9_]+)\]")
+BANNED_NAMES = ["mod", "game", "old", "outdated", "wip", "api", "beta", "alpha", "git"]
+def getNameFromTaglist(taglist):
+	for tag in reversed(regex_tag.findall(taglist)):
+		if len(tag) < 30 and not tag in BANNED_NAMES and \
+				not re.match(r"^[a-z]?[0-9]+$", tag):
+			return tag
+
+	return None
+
+regex_title = re.compile(r"^((?:\[[^\]]+\] *)*)([^\[]+) *((?:\[[^\]]+\] *)*)[^\[]*$")
+def parseTitle(title):
+	m = regex_title.match(title)
+	if m is None:
+		print("Invalid title format: " + title)
+		return title, getNameFromTaglist(title)
+	else:
+		return m.group(2).strip(), getNameFromTaglist(m.group(3))
+
+def getLinksFromModSearch():
+	links = {}
+
 	contents = urllib.request.urlopen("http://krock-works.16mb.com/MTstuff/modList.php").read().decode("utf-8")
-	list = json.loads(contents)
-	found = {}
-	imported = []
+	for x in json.loads(contents):
+		link = x.get("link")
+		if link is not None:
+			links[int(x["topicId"])] = link
 
-	for user in User.query.all():
-		found[user.username] = True
-		if user.forums_username is not None:
-			found[user.forums_username] = True
-
-	for x in list:
-		author = x.get("author")
-		if author is not None and not author in found:
-			user = User(author)
-			user.forums_username = author
-			imported.append(author)
-			found[author] = True
-			db.session.add(user)
-
-	db.session.commit()
-	for author in found:
-		checkForumAccount.delay(author, None)
-
-
-BANNED_NAMES = ["mod", "game", "old", "outdated", "wip", "api"]
-ALLOWED_TYPES = [1, 2, 6]
+	return links
 
 @celery.task()
-def importKrocksModList():
-	contents = urllib.request.urlopen("http://krock-works.16mb.com/MTstuff/modList.php").read().decode("utf-8")
-	list = json.loads(contents)
+def importTopicList():
+	links_by_id = getLinksFromModSearch()
+
+	info_by_id = {}
+	getTopicsFromForum(11, out=info_by_id, extra={ 'type': PackageType.MOD })
+	getTopicsFromForum(15, out=info_by_id, extra={ 'type': PackageType.GAME })
+
+	# Caches
 	username_to_user = {}
+	topics_by_id = {}
+	for topic in ForumTopic.query.all():
+		topics_by_id[topic.topic_id] = topic
 
-	KrockForumTopic.query.delete()
+	# Create or update
+	for info in info_by_id.values():
+		id = int(info["id"])
 
-	for x in list:
-		type = int(x["type"])
-		if not type in ALLOWED_TYPES:
-			continue
-
-		username = x["author"]
+		# Get author
+		username = info["author"]
 		user = username_to_user.get(username)
 		if user is None:
 			user = User.query.filter_by(forums_username=username).first()
-			assert(user is not None)
+			if user is None:
+				print(username + " not found!")
+				user = User(username)
+				user.forums_username = username
+				db.session.add(user)
 			username_to_user[username] = user
 
-		import re
-		tags = re.findall("\[([a-z0-9_]+)\]", x["title"])
-		name = None
-		for tag in reversed(tags):
-			if len(tag) < 30 and not tag in BANNED_NAMES and \
-					not re.match("^([a-z][0-9]+)$", tag):
-				name = tag
-				break
+		# Get / add row
+		topic = topics_by_id.get(id)
+		if topic is None:
+			topic = ForumTopic()
+			db.session.add(topic)
 
-		topic = KrockForumTopic()
-		topic.topic_id = x["topicId"]
-		topic.author_id = user.id
-		topic.ttype = type
-		topic.title = x["title"]
-		topic.name = name
-		topic.link = x.get("link")
-		db.session.add(topic)
+		# Parse title
+		title, name = parseTitle(info["title"])
+
+		# Get link
+		link = links_by_id.get(id)
+
+		# Fill row
+		topic.topic_id = id
+		topic.author = user
+		topic.type = info["type"]
+		topic.title = title
+		topic.name = name
+		topic.link = link
+		topic.posts = info["posts"]
+		topic.views = info["views"]
+		topic.created_at = info["date"]
 
 	db.session.commit()
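With the forumtasks.py changes above, importTopicList() builds the topic list by scraping the forum sections directly (forum 11 for mods and forum 15 for games, via getTopicsFromForum() added in the next file) and only consults Krock's mod search for repository links. Much of the accuracy gain comes from the title handling: regex_title splits a topic title into leading tags, the bare title and trailing tags, and getNameFromTaglist() picks the last trailing tag that looks like a mod name, skipping BANNED_NAMES entries and version-like tags. A rough usage sketch, assuming the patched module and an invented example title:

from app.tasks.forumtasks import parseTitle  # available once this patch is applied

# Hypothetical title in the usual forum style.
title, name = parseTitle("[Mod] Mesecons [mesecons]")
# With the regexes above this should give title == "Mesecons" and name == "mesecons";
# a trailing "[wip]" or "[2]" tag would be skipped, because BANNED_NAMES entries and
# number-only tags are never used as the name.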
diff --git a/app/tasks/phpbbparser.py b/app/tasks/phpbbparser.py
index d27ccec..9984ad0 100644
--- a/app/tasks/phpbbparser.py
+++ b/app/tasks/phpbbparser.py
@@ -5,6 +5,7 @@
 import urllib, socket
 from bs4 import *
 from urllib.parse import urljoin
+from datetime import datetime
 import urllib.request
 import os.path
 import time, re
@@ -77,3 +78,72 @@ def getProfile(url, username):
 	__extract_properties(profile, soup)
 
 	return profile
+
+
+regex_id = re.compile(r"^.*t=([0-9]+).*$")
+
+def parseForumListPage(id, page, out, extra=None):
+	num_per_page = 30
+	start = page*num_per_page+1
+	print(" - Fetching page {} (topics {}-{})".format(page, start, start+num_per_page))
+
+	url = "https://forum.minetest.net/viewforum.php?f=" + str(id) + "&start=" + str(start)
+	r = urllib.request.urlopen(url).read().decode("utf-8")
+	soup = BeautifulSoup(r, "html.parser")
+
+	for row in soup.find_all("li", class_="row"):
+		classes = row.get("class")
+		if "sticky" in classes or "announce" in classes or "global-announce" in classes:
+			continue
+
+		topic = row.find("dl")
+
+		# Link info
+		link = topic.find(class_="topictitle")
+		id = regex_id.match(link.get("href")).group(1)
+		title = link.find(text=True)
+
+		# Date
+		left = topic.find("dt")
+		date = left.get_text().split("»")[1].strip()
+		date = datetime.strptime(date, "%a %b %d, %Y %H:%M")
+		author = left.find_all("a")[-1].get_text().strip()
+
+		# Get counts
+		posts = topic.find(class_="posts").find(text=True)
+		views = topic.find(class_="views").find(text=True)
+
+		if id in out:
+			print(" - got {} again, title: {}".format(id, title))
+			assert(title == out[id]['title'])
+			return False
+
+		row = {
+			"id" : id,
+			"title" : title,
+			"author": author,
+			"posts" : posts,
+			"views" : views,
+			"date" : date
+		}
+
+		if extra is not None:
+			for key, value in extra.items():
+				row[key] = value
+
+		out[id] = row
+
+	return True
+
+def getTopicsFromForum(id, out={}, extra=None):
+	print("Fetching all topics from forum {}".format(id))
+	page = 0
+	while parseForumListPage(id, page, out, extra):
+		page = page + 1
+
+	return out
+
+def dumpTitlesToFile(topics, path):
+	with open(path, "w") as out_file:
+		for topic in topics.values():
+			out_file.write(topic["title"] + "\n")
diff --git a/app/templates/admin/list.html b/app/templates/admin/list.html
index e5049f9..c565fe0 100644
--- a/app/templates/admin/list.html
+++ b/app/templates/admin/list.html
@@ -17,8 +17,7 @@