Fix #24, refactor URL detection

fix-ismusicvideo-detection v1.3.0
Max Nuding 2023-04-24 19:38:13 +02:00
parent 9bbcc843c2
commit 68aade4f1f
Signed by: phlaym
GPG Key ID: A06651BAB6777237
6 changed files with 696 additions and 428 deletions

View File

@ -1,7 +1,5 @@
HASHTAG_FILTER = ichlausche,music,musik,nowplaying,tunetuesday,nowlistening
URL_FILTER = song.link,album.link,spotify.com,music.apple.com,bandcamp.com,songwhip.com
YOUTUBE_API_KEY = CHANGE_ME
YOUTUBE_DISABLE = false
ODESLI_API_KEY = CHANGE_ME
MASTODON_INSTANCE = 'metalhead.club'
BASE_URL = 'https://moshingmammut.phlaym.net'

View File

@ -11,8 +11,8 @@ Having a quick overview over what is being posted can be a great way to discover
This is fairly simple from a technical point of view! metalhead.club's local timeline is being watched using the
Mastodon Streaming API over a Websocket. Every time a new post arrives, it is checked if it contains any music by
checking included hashtags and URLs. A list of tags and URLs can be found in [the configuration](.env.EXAMPLE).
Additionally, links to YouTube are checked via the YouTube API to determine whether they are music or other videos.
checking included hashtags and URLs. A list of tags can be found in [the configuration](.env.EXAMPLE).
Additionally, links are checked for music by testing whether https://song.link finds info on them.
If a post passes this check it is saved to a SQLite database.
@ -93,11 +93,12 @@ and set your `User`, `Group`, `ExecStart` and `WorkingDirectory` accordingly.
#### On your development machine
Copy `.env.EXAMPLE` to `.env` and add your `YOUTUBE_API_KEY`.
Copy `.env.EXAMPLE` to `.env` and add your `YOUTUBE_API_KEY` and `ODESLI_API_KEY`.
To obtain one follow [YouTube's guide](https://developers.google.com/youtube/registering_an_application) to create an
_API key_.
If `YOUTUBE_API_KEY` is unset, all YouTube videos will be assumed to contain music links.
If this is unwanted, set `YOUTUBE_DISABLE` to `true`.
If `YOUTUBE_API_KEY` is unset, no playlist will be updated.
If `ODESLI_API_KEY` is unset, your rate limit to the song.link API will be lower.
Run `npm run build` and copy the output folder, usually `build`, to `$APP_DIR` on your server.

903
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,6 @@
{
"name": "moshing-mammut",
"version": "1.1.0",
"version": "1.3.0",
"private": true,
"license": "LGPL-3.0-or-later",
"scripts": {

View File

@ -1,89 +1,17 @@
import {
HASHTAG_FILTER,
MASTODON_INSTANCE,
ODESLI_API_KEY,
URL_FILTER,
YOUTUBE_API_KEY,
YOUTUBE_DISABLE
} from '$env/static/private';
import { HASHTAG_FILTER, MASTODON_INSTANCE, ODESLI_API_KEY } from '$env/static/private';
import { log } from '$lib/log';
import type { Post, Tag, TimelineEvent } from '$lib/mastodon/response';
import type { OdesliResponse, Platform, SongInfo } from '$lib/odesliResponse';
import { getPosts, savePost } from '$lib/server/db';
import { createFeed, saveAtomFeed } from '$lib/server/rss';
import { sleep } from '$lib/sleep';
import { isTruthy } from '$lib/truthyString';
import { WebSocket } from 'ws';
/**
 * Matches YouTube links in either the long form (`youtube.com/...v=<id>`) or the
 * short form (`youtu.be/<id>`) and captures the video ID in the named group `videoId`.
 * The dot in `youtube.com` is escaped so that only a literal `.` is accepted.
 * The `g` flag is required because the pattern is consumed via `String.prototype.matchAll`.
 */
const YOUTUBE_REGEX =
	/https?:\/\/(www\.)?youtu((be\.com\/.*?v=)|(\.be\/))(?<videoId>[a-zA-Z_0-9-]+)/gm;
/**
 * Matches `href="..." target="_blank"` attribute pairs in a post's HTML content and
 * captures the link target in the named group `postUrl`.
 */
const URL_REGEX = /href="(?<postUrl>[^>]+?)" target="_blank"/gm;
export class TimelineReader {
private static _instance: TimelineReader;
/**
 * Asks the YouTube Data API whether a video is music.
 *
 * A video counts as music when its snippet carries the tag `music`, or when the
 * title of its video category is `Music`.
 *
 * @param videoId ID of the YouTube video to look up
 * @returns `true` if the video is considered music. Also `true` when no usable
 * `YOUTUBE_API_KEY` is configured, because no lookup can be made in that case.
 */
private static async isMusicVideo(videoId: string): Promise<boolean> {
	if (!YOUTUBE_API_KEY || YOUTUBE_API_KEY === 'CHANGE_ME') {
		// Assume that it *is* a music link when no YT API key is provided.
		// To assume the opposite, YOUTUBE_DISABLE needs to be set to something truthy.
		return true;
	}
	const searchParams = new URLSearchParams([
		['part', 'snippet'],
		['id', videoId],
		['key', YOUTUBE_API_KEY]
	]);
	const youtubeVideoUrl = new URL(`https://www.googleapis.com/youtube/v3/videos?${searchParams}`);
	const resp = await fetch(youtubeVideoUrl);
	const respObj = await resp.json();
	// `items` may be missing (e.g. on an error response) or empty (unknown video ID)
	if (!respObj?.items?.length) {
		log.warn('Could not find video with id', videoId);
		return false;
	}
	// With `part=snippet`, tags and categoryId are nested inside the item's `snippet`
	const snippet = respObj.items[0].snippet;
	if (snippet?.tags?.includes('music')) {
		return true;
	}
	if (!snippet?.categoryId) {
		// Without a category there is nothing left to look up
		return false;
	}
	const categorySearchParams = new URLSearchParams([
		['part', 'snippet'],
		['id', snippet.categoryId],
		['key', YOUTUBE_API_KEY]
	]);
	const youtubeCategoryUrl = new URL(
		`https://www.googleapis.com/youtube/v3/videoCategories?${categorySearchParams}`
	);
	const categoryResp = await fetch(youtubeCategoryUrl);
	const categoryObj = await categoryResp.json();
	// The category title is nested inside `snippet` as well
	const categoryTitle: string | undefined = categoryObj?.items?.[0]?.snippet?.title;
	return categoryTitle === 'Music';
}
/**
 * Scans a post's content for YouTube links and returns the first one that
 * `isMusicVideo` reports as music.
 *
 * @param postContent Content of the post to scan
 * @returns The matched YouTube URL text, or `null` if YouTube checking is disabled
 * via `YOUTUBE_DISABLE` or no link was identified as music
 */
private static async checkYoutubeMatches(postContent: string): Promise<string | null> {
	const youtubeDisabled = isTruthy(YOUTUBE_DISABLE);
	if (youtubeDisabled) {
		return null;
	}
	// Links are checked one after another so the first music link ends the search
	for (const candidate of postContent.matchAll(YOUTUBE_REGEX)) {
		if (!candidate?.groups) {
			continue;
		}
		const videoId = candidate.groups.videoId.toString();
		try {
			if (await TimelineReader.isMusicVideo(videoId)) {
				return candidate[0];
			}
		} catch (e) {
			// A failed lookup only skips this link; remaining links are still checked
			log.error('Could not check if', videoId, 'is a music video', e);
		}
	}
	return null;
}
private static async getSongInfo(url: URL, remainingTries = 6): Promise<SongInfo | null> {
if (remainingTries === 0) {
log.error('No tries remaining. Lookup failed!');
@ -109,16 +37,18 @@ export class TimelineReader {
if (response.status === 429) {
throw new Error('Rate limit reached', { cause: 429 });
}
return response.json().then((odesliInfo: OdesliResponse) => {
const info = odesliInfo.entitiesByUniqueId[odesliInfo.entityUniqueId];
const platform: Platform = 'youtube';
return {
...info,
pageUrl: odesliInfo.pageUrl,
youtubeUrl: odesliInfo.linksByPlatform[platform]?.url,
postedUrl: url.toString()
} as SongInfo;
});
const odesliInfo: OdesliResponse = await response.json();
if (!odesliInfo || !odesliInfo.entitiesByUniqueId || !odesliInfo.entityUniqueId) {
return null;
}
const info = odesliInfo.entitiesByUniqueId[odesliInfo.entityUniqueId];
const platform: Platform = 'youtube';
return {
...info,
pageUrl: odesliInfo.pageUrl,
youtubeUrl: odesliInfo.linksByPlatform[platform]?.url,
postedUrl: url.toString()
} as SongInfo;
});
} catch (e) {
if (e instanceof Error && e.cause === 429) {
@ -131,24 +61,6 @@ export class TimelineReader {
}
}
/**
 * Intended to return the URL of a post's preview card.
 * The lookup is currently disabled, so this always resolves to `undefined`.
 *
 * @param post The post whose preview card would be inspected (currently unused)
 * @returns Always `undefined` for now
 */
private static async getUrlFromPreviewCard(post: Post): Promise<string | undefined> {
return undefined;
// Currently disabled, because it seems to always be null, even after re-fetching the post from Mastodon
// The previous implementation is kept below for reference.
/*
if (post.card) {
return post.card?.url;
}
try {
const status: Post = await (
await fetch(`https://${MASTODON_INSTANCE}/api/v1/statuses/${post.id}`)
).json();
return status.card?.url;
} catch (e) {
log.error(`Could not fetch status ${post.url}`, e);
}
*/
}
private startWebsocket() {
const socket = new WebSocket(`wss://${MASTODON_INSTANCE}/api/v1/streaming`);
socket.onopen = () => {
@ -165,74 +77,27 @@ export class TimelineReader {
const hashttags: string[] = HASHTAG_FILTER.split(',');
const found_tags: Tag[] = post.tags.filter((t: Tag) => hashttags.includes(t.name));
const urls: string[] = URL_FILTER.split(',');
const found_urls = urls.filter((t) => post.content.includes(t));
// If we don't have any tags or non-youtube urls, check youtube
// YT is handled separately, because it requires an API call and therefore is slower
if (found_urls.length === 0 && found_tags.length === 0) {
const youtubeUrl = await TimelineReader.checkYoutubeMatches(post.content);
if (youtubeUrl === null) {
log.log('Ignoring post', post.url);
return;
}
log.debug('Found YT URL', youtubeUrl, found_urls, found_urls.length);
} else {
log.debug('Found URLs and/or tags:', found_urls, found_tags);
}
// TODO: Change URL detection above to use this regex.
// Looks like we're stuck with regex for now instead of using preview cards.
// Might as well use it to find URLs. Could also use this for YouTube: If Odesli finds something, it's a song,
// if not, ignore it. No need to consult the YT API and give those links a special handling
const musicUrls: URL[] = [];
const musicUrl = await TimelineReader.getUrlFromPreviewCard(post);
if (musicUrl) {
try {
musicUrls.push(new URL(musicUrl));
} catch (e) {
log.error(
'URL received from preview card does not seem to be a valid URL',
musicUrl,
e
);
}
} else {
const urlMatches = post.content.matchAll(URL_REGEX);
for (const match of urlMatches) {
if (match === undefined || match.groups === undefined) {
console.warn(
'Match listed in allMatches, but either it or its groups are undefined',
match
);
continue;
}
const urlMatch = match.groups.postUrl.toString();
let url: URL;
try {
url = new URL(urlMatch);
} catch (e) {
log.error('URL found via Regex does not seem to be a valud url', urlMatch, e);
continue;
}
// Check *all* found url and let odesli determine if it is music or not
musicUrls.push(url);
}
}
const urlMatches = post.content.matchAll(URL_REGEX);
const songs: SongInfo[] = [];
log.debug(`Checking ${musicUrls.length} URLs if they contain song data`);
for (const url of musicUrls) {
let hostname: string | null = null;
try {
hostname = new URL(url).hostname;
} catch (e) {
log.error(`Could not check hostname for URL ${url}`, e);
}
if (hostname === 'songwhip.com') {
// TODO: Implement checking the songwhip API
for (const match of urlMatches) {
if (match === undefined || match.groups === undefined) {
log.warn(
'Match listed in allMatches, but either it or its groups are undefined',
match
);
continue;
}
const urlMatch = match.groups.postUrl.toString();
let url: URL;
try {
url = new URL(urlMatch);
} catch (e) {
log.error('URL found via Regex does not seem to be a valud url', urlMatch, e);
continue;
}
// Check *all* found url and let odesli determine if it is music or not
log.debug(`Checking ${url} if it contains song data`);
const info = await TimelineReader.getSongInfo(url);
log.debug(`Found song info for ${url}?`, info);
if (info) {
@ -240,6 +105,13 @@ export class TimelineReader {
}
}
// If we found neither song data for any URL nor a matching hashtag,
// the post is not about music and is ignored
if (songs.length === 0 && found_tags.length === 0) {
log.log('Ignoring post', post.url);
return;
}
await savePost(post, songs);
log.debug('Saved post', post.url);

View File

@ -187,7 +187,7 @@
}
.post {
width: 100%;
max-width: 600px;
max-width: min(800px, 80vw);
margin-bottom: 1em;
border-bottom: 1px solid var(--color-border);
padding: 1em;