Setup feeds, request, and filter them

This commit is contained in:
Lewis Dale 2024-09-20 13:22:05 +01:00
parent 657975324a
commit 70e78bcf42
10 changed files with 330 additions and 4 deletions

View File

@ -4,6 +4,8 @@ require_once __DIR__ . '/../vendor/autoload.php';
use Doctrine\ORM\EntityManager;
use Doctrine\ORM\Tools\Console\ConsoleRunner;
use Doctrine\ORM\Tools\Console\EntityManagerProvider\SingleManagerProvider;
use Lewisdale\App\Models\Repositories\FeedRepository;
use Lewisdale\App\Tools\Console\TestFeed;
$dotenv = Dotenv\Dotenv::createImmutable([__DIR__, __DIR__ . "/.."]);
$dotenv->load();
@ -12,6 +14,11 @@ $dotenv->load();
require_once __DIR__ . '/../src/dependencies.php';
global $container;
$commands = [
$container->get(TestFeed::class),
];
ConsoleRunner::run(
new SingleManagerProvider($container->get(EntityManager::class))
new SingleManagerProvider($container->get(EntityManager::class)),
$commands
);

View File

@ -20,7 +20,10 @@
"symfony/dom-crawler": "^6.3",
"symfony/css-selector": "^6.3",
"symfony/http-client": "^6.3",
"league/uri": "^6.8"
"league/uri": "^6.8",
"ext-simplexml": "*",
"ext-curl": "*",
"ext-dom": "*"
},
"require-dev": {
"phpunit/phpunit": "^10.0"

View File

@ -26,7 +26,7 @@ class FeedController
return $this->view->render($response, 'index.twig.html', ['feeds' => $feeds]);
}
public function create(ServerRequestInterface $request, ResponseInterface $response)
public function create(ServerRequestInterface $request, ResponseInterface $response): ResponseInterface
{
$this->logger->info("FeedController::create() called");
$body = $request->getParsedBody();

View File

@ -4,9 +4,16 @@ declare(strict_types=1);
namespace Lewisdale\App\Models\Data;
use Doctrine\Common\Collections\Collection;
use Doctrine\DBAL\Types\Types;
use Doctrine\ORM\Mapping as ORM;
use DOMXPath;
use Lewisdale\App\Models\Types\XMLElement;
use Lewisdale\App\Requests\Robots;
use Ramsey\Uuid\Doctrine\UuidGenerator;
use Ramsey\Uuid\UuidInterface;
use SimpleXMLElement;
use function Symfony\Component\String\s;
#[ORM\Entity]
#[ORM\Table(name: 'feeds')]
@ -24,4 +31,58 @@ class Feed
#[ORM\Column(type: 'string')]
public string $title;
/**
* @var Collection<FeedFilter>
*/
#[ORM\OneToMany(mappedBy: 'feed', targetEntity: FeedFilter::class, cascade: ['persist', 'remove'])]
public Collection $feedFilters;
#[ORM\Column(type: XMLElement::NAME, nullable: true)]
public SimpleXMLElement|null $remoteFeed;
#[ORM\Column(type: XMLElement::NAME, nullable: true)]
public SimpleXMLElement|null $filteredFeed;
/**
 * Download the remote feed and store the parsed document in $remoteFeed.
 *
 * The URL is only requested when Robots::allowed() permits it.
 *
 * @throws \Exception when robots.txt disallows the URL, or when the
 *                    document cannot be downloaded or parsed as XML
 */
public function fetch(): void
{
    if (!Robots::allowed($this->url)) {
        throw new \Exception("Robots.txt disallows fetching this feed");
    }

    // simplexml_load_file() returns false on any network or parse failure.
    // Assigning that to the typed property would surface as an opaque
    // TypeError, so report the real cause instead.
    $feed = simplexml_load_file($this->url);
    if ($feed === false) {
        throw new \RuntimeException("Unable to load or parse the feed at {$this->url}");
    }

    $this->remoteFeed = $feed;
}
/**
 * Check one feed item against every filter attached to this feed.
 *
 * @param \DOMNode $item the item node to test
 *
 * @return bool true when no filter rejects the item (including when the
 *              feed has no filters), false as soon as one filter's
 *              execute() returns false
 *
 * @throws \InvalidArgumentException when the node cannot be represented
 *                                   as a SimpleXMLElement
 */
private function test_item(\DOMNode $item): bool
{
    // Nothing to evaluate: keep the original "no filters => accepted" result
    // without touching the node at all.
    if ($this->feedFilters->isEmpty()) {
        return true;
    }

    // The conversion does not depend on the filter, so do it once rather
    // than once per loop iteration.
    $element = simplexml_import_dom($item);
    if ($element === null) {
        throw new \InvalidArgumentException('Feed item could not be converted to a SimpleXMLElement');
    }

    foreach ($this->feedFilters as $filter) {
        if (!$filter->execute($element)) {
            return false;
        }
    }

    return true;
}
/**
 * Build $filteredFeed by removing every <item> that matches at least one
 * of this feed's filters (as expressed by FeedFilter::to_xpath()).
 *
 * The remote feed is fetched first when it has not been loaded yet.
 * $remoteFeed itself is left untouched.
 *
 * @throws \Exception when the feed has to be fetched and that fails, or
 *                    when the combined filter expression is not valid XPath
 */
public function filter(): void
{
    // isset() is false both for null and for a typed property that was never
    // initialised, so a freshly constructed entity does not raise an Error.
    if (!isset($this->remoteFeed)) {
        $this->fetch();
    }

    // SimpleXML and DOM views share one underlying document. Filter a deep
    // copy so that removing items does not also strip them from $remoteFeed.
    $source = dom_import_simplexml($this->remoteFeed);
    $doc = $source->ownerDocument->cloneNode(true);

    // Each filter contributes one predicate. They are parenthesised and joined
    // with "or" because the union operator "|" is only defined for node-sets
    // and would break on boolean predicates such as title='x'.
    $predicates = array_map(
        static fn (FeedFilter $filter): string => '(' . $filter->to_xpath() . ')',
        $this->feedFilters->toArray(),
    );

    $to_remove = [];
    if ($predicates !== []) {
        $xpath = new DOMXPath($doc);
        $matches = $xpath->query('//item[' . implode(' or ', $predicates) . ']');
        if ($matches === false) {
            throw new \RuntimeException('Feed filters produced an invalid XPath expression');
        }
        // Snapshot the result so the list is stable while nodes are detached.
        $to_remove = iterator_to_array($matches);
    }

    echo "Removing " . count($to_remove) . " items\n";
    foreach ($to_remove as $item) {
        $item->parentNode->removeChild($item);
    }

    $this->filteredFeed = simplexml_import_dom($doc->documentElement);
}
}

View File

@ -13,6 +13,24 @@ use Ramsey\Uuid\UuidInterface;
#[ORM\HasLifecycleCallbacks]
class FeedFilter
{
/**
 * @param FilterTarget       $target which item field the filter inspects
 * @param FilterType         $filter how the field is compared with $value
 * @param string             $value  the text (or pattern) to compare against
 * @param Feed               $feed   the feed this filter belongs to
 * @param UuidInterface|null $id     explicit identifier; when omitted the id
 *                                   property is left unset (presumably so the
 *                                   ORM's configured generator supplies one —
 *                                   confirm against the mapping)
 */
public function __construct(
    FilterTarget $target,
    FilterType $filter,
    string $value,
    Feed $feed,
    // Explicitly nullable: relying on "= null" to make the type nullable is
    // deprecated as of PHP 8.4.
    ?UuidInterface $id = null,
) {
    if ($id !== null) {
        $this->id = $id;
    }

    $this->target = $target;
    $this->filter = $filter;
    $this->value = $value;
    $this->feed = $feed;
}
#[ORM\Id]
#[ORM\Column(type: "uuid", unique: true)]
#[ORM\GeneratedValue(strategy: "CUSTOM")]
@ -33,7 +51,30 @@ class FeedFilter
#[ORM\Column(type: 'string')]
public string $value;
#[ORM\ManyToOne(targetEntity: Feed::class)]
#[ORM\ManyToOne(targetEntity: Feed::class, cascade: ['persist', 'remove'], inversedBy: 'feedFilters')]
#[ORM\JoinColumn(name: 'feed_id', referencedColumnName: 'id')]
public Feed $feed;
/**
 * Express this filter as an XPath 1.0 predicate, relative to a feed item.
 *
 * The filter value is emitted as a properly quoted XPath string literal, so
 * values containing quotes can neither break nor alter the expression.
 *
 * @return string predicate text, e.g. title[contains(text(), 'foo')]
 *
 * @throws \LogicException for filter types that have no XPath form
 *                         (XPath 1.0 has no regular-expression support)
 */
public function to_xpath(): string
{
    $field = $this->target->value;
    $literal = self::quote_xpath_literal($this->value);

    return match ($this->filter) {
        FilterType::EXACT => "{$field}={$literal}",
        FilterType::INCLUDE => "{$field}[not(contains(text(), {$literal}))]",
        FilterType::EXCLUDE => "{$field}[contains(text(), {$literal})]",
        default => throw new \LogicException(
            "Filter type {$this->filter->name} cannot be expressed as an XPath predicate"
        ),
    };
}

/**
 * Quote an arbitrary string as an XPath 1.0 string literal.
 *
 * XPath 1.0 has no escape sequences: a literal is delimited by either kind
 * of quote and may not contain its own delimiter. A value containing both
 * kinds is therefore assembled with concat() from single-quoted pieces
 * separated by a double-quoted apostrophe.
 */
private static function quote_xpath_literal(string $value): string
{
    if (!str_contains($value, "'")) {
        return "'{$value}'";
    }

    if (!str_contains($value, '"')) {
        return "\"{$value}\"";
    }

    return "concat('" . implode("', \"'\", '", explode("'", $value)) . "')";
}
/**
 * Evaluate this filter against a single feed item.
 *
 * @param \SimpleXMLElement $item element whose title/description/link child
 *                                is read, depending on the filter target
 *
 * @return bool the outcome of the comparison selected by the filter type
 *
 * @throws \RuntimeException when a REGEX filter holds a pattern that
 *                           preg_match() cannot evaluate
 */
public function execute(\SimpleXMLElement $item): bool
{
    $value = (string) match ($this->target) {
        FilterTarget::TITLE => $item->title,
        FilterTarget::DESCRIPTION => $item->description,
        FilterTarget::LINK => $item->link,
    };

    // NOTE(review): INCLUDE compares case-insensitively while EXCLUDE is
    // case-sensitive; left as-is — confirm whether that asymmetry is intended.
    return match ($this->filter) {
        FilterType::INCLUDE => str_contains(strtolower($value), strtolower($this->value)),
        FilterType::EXACT => $value === $this->value,
        FilterType::REGEX => $this->matches_pattern($value),
        FilterType::EXCLUDE => !str_contains($value, $this->value),
    };
}

/**
 * Run the filter value as a PCRE pattern against $subject.
 *
 * preg_match() returns 1, 0 or false; returning that directly from a method
 * declared to return bool is a TypeError under strict_types, so the result
 * is converted explicitly and a failed evaluation is reported.
 */
private function matches_pattern(string $subject): bool
{
    $result = preg_match($this->value, $subject);
    if ($result === false) {
        throw new \RuntimeException(
            "Invalid filter pattern \"{$this->value}\": " . preg_last_error_msg()
        );
    }

    return $result === 1;
}
}

View File

@ -0,0 +1,40 @@
<?php
declare(strict_types=1);
namespace Lewisdale\App\Models\Types;
use Doctrine\DBAL\Platforms\AbstractPlatform;
use Doctrine\DBAL\Types\Type;
/**
 * Doctrine DBAL column type that maps a SimpleXMLElement to a TEXT column
 * holding its serialised XML.
 */
class XMLElement extends Type
{
    /** Name under which this type is registered and referenced in mappings. */
    public const NAME = 'xml_element';

    /**
     * The XML is stored as plain text, whatever the platform.
     */
    public function getSQLDeclaration(array $column, AbstractPlatform $platform): string
    {
        return 'TEXT';
    }

    /**
     * Turn the stored XML text back into a SimpleXMLElement.
     *
     * @param mixed $value database value: null, an XML string, or an
     *                     already-converted SimpleXMLElement
     *
     * @return \SimpleXMLElement|null null only when the column is null
     *
     * @throws \Doctrine\DBAL\Types\ConversionException when the value is not
     *         a string or is not well-formed XML
     */
    public function convertToPHPValue($value, AbstractPlatform $platform): ?\SimpleXMLElement
    {
        if ($value === null) {
            return null;
        }

        if ($value instanceof \SimpleXMLElement) {
            return $value;
        }

        if (!is_string($value)) {
            throw new \Doctrine\DBAL\Types\ConversionException(sprintf(
                'Could not convert database value of type %s to %s',
                get_debug_type($value),
                self::NAME,
            ));
        }

        // Collect parse errors internally instead of emitting warnings, and
        // restore the previous libxml setting whatever happens.
        $previous = libxml_use_internal_errors(true);
        try {
            $xml = simplexml_load_string($value);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        // simplexml_load_string() signals malformed XML with false, which must
        // not leak into a SimpleXMLElement|null entity property.
        if ($xml === false) {
            throw new \Doctrine\DBAL\Types\ConversionException(
                'Could not convert database value to ' . self::NAME . ': stored text is not well-formed XML'
            );
        }

        return $xml;
    }

    /**
     * Serialise a SimpleXMLElement for storage; any other value (including
     * null) is passed through unchanged.
     *
     * @param mixed $value
     *
     * @return mixed the XML string for a SimpleXMLElement, otherwise $value
     *
     * @throws \Doctrine\DBAL\Types\ConversionException when serialisation fails
     */
    public function convertToDatabaseValue($value, AbstractPlatform $platform): mixed
    {
        if (!$value instanceof \SimpleXMLElement) {
            return $value;
        }

        $xml = $value->asXML();
        if ($xml === false) {
            throw new \Doctrine\DBAL\Types\ConversionException(
                'Could not serialise ' . self::NAME . ' value to XML'
            );
        }

        return $xml;
    }

    public function getName(): string
    {
        return self::NAME;
    }
}

89
src/Requests/Robots.php Normal file
View File

@ -0,0 +1,89 @@
<?php
declare(strict_types=1);
namespace Lewisdale\App\Requests;
/**
 * Minimal robots.txt checker used before fetching a remote feed.
 */
class Robots
{
    // Original PHP code by Chirp Internet: www.chirpinternet.eu
    // Adapted to include 404 and Allow directive checking by Eric at LinkUp.com
    // Please acknowledge use of this code by including this header.

    /**
     * Decide whether the robots.txt of the URL's host permits fetching $url.
     *
     * robots.txt is requested with the same scheme (and port) as $url and is
     * only honoured when the server answers 200. The longest matching
     * Allow/Disallow rule wins, and Allow wins a tie. Rules from every group
     * whose User-agent matches "*" or $useragent are considered together.
     *
     * @param string      $url       absolute URL about to be fetched
     * @param string|null $useragent agent name to match in addition to "*"
     *
     * @return bool true when fetching is permitted; also true when there is
     *              no robots.txt to consult, or when $url has no host
     */
    public static function allowed(string $url, string|null $useragent = "Baleen"): bool
    {
        $parsed = parse_url($url);

        // Without a host there is no robots.txt to ask; historically this case
        // fell through to "allowed" as well (after emitting warnings).
        if ($parsed === false || empty($parsed['host'])) {
            return true;
        }

        $lines = self::fetchRobotsTxt($parsed, $useragent);
        if ($lines === []) {
            return true;
        }

        // A URL such as https://example.com has no path component: treat it as
        // "/". The query string takes part in matching so that rules like
        // "Disallow: /*?" can apply.
        $target = ($parsed['path'] ?? '') === '' ? '/' : $parsed['path'];
        if (isset($parsed['query'])) {
            $target .= '?' . $parsed['query'];
        }

        return self::evaluate(self::collectRules($lines, $useragent), $target);
    }

    /**
     * Download robots.txt for the parsed URL's origin and split it into lines.
     *
     * @param array<string, int|string> $parsed parse_url() result containing a host
     *
     * @return list<string> the file's lines, or [] when it is unavailable
     */
    private static function fetchRobotsTxt(array $parsed, ?string $useragent): array
    {
        // Use the feed's own scheme: asking an HTTPS-only site over plain HTTP
        // typically yields a redirect, which used to be read as "no robots.txt".
        $scheme = strtolower((string) ($parsed['scheme'] ?? 'http')) === 'https' ? 'https' : 'http';
        $port = isset($parsed['port']) ? ":{$parsed['port']}" : '';
        $location = "{$scheme}://{$parsed['host']}{$port}/robots.txt";

        if (function_exists('curl_init')) {
            $handle = curl_init($location);
            if ($handle === false) {
                return [];
            }

            curl_setopt_array($handle, [
                CURLOPT_RETURNTRANSFER => true,
                CURLOPT_FOLLOWLOCATION => true,
                CURLOPT_MAXREDIRS => 5,
                // Never let an unresponsive host stall the caller indefinitely.
                CURLOPT_CONNECTTIMEOUT => 5,
                CURLOPT_TIMEOUT => 10,
            ]);
            if ($useragent) {
                curl_setopt($handle, CURLOPT_USERAGENT, $useragent);
            }

            $response = curl_exec($handle);
            $httpCode = curl_getinfo($handle, CURLINFO_HTTP_CODE);

            // only pay attention to robots.txt if the server says it exists
            if (200 !== $httpCode || !is_string($response)) {
                return [];
            }

            return preg_split('/\r\n|\r|\n/', $response) ?: [];
        }

        // Best-effort fallback without cURL: a failed request means "no robots.txt".
        $lines = @file($location, FILE_IGNORE_NEW_LINES);

        return $lines === false ? [] : $lines;
    }

    /**
     * Extract the Allow/Disallow rules from groups addressed to "*" or $useragent.
     *
     * @param list<string> $lines
     *
     * @return list<array{type: string, path: string}>
     */
    private static function collectRules(array $lines, ?string $useragent): array
    {
        $agents = [preg_quote('*', '/')];
        if ($useragent) {
            $agents[] = preg_quote($useragent, '/');
        }
        $agentPattern = '/(' . implode('|', $agents) . ')/i';

        $rules = [];
        $applies = false;
        $inAgentHeader = false;

        foreach ($lines as $line) {
            // drop comments, then skip blank lines
            $line = trim(explode('#', $line, 2)[0]);
            if ($line === '') {
                continue;
            }

            // anything that is not "field: value" is not a directive
            $parts = explode(':', $line, 2);
            if (count($parts) !== 2) {
                continue;
            }
            $type = strtolower(trim($parts[0]));
            $value = trim($parts[1]);

            if ($type === 'user-agent') {
                $matches = preg_match($agentPattern, $value) === 1;
                // Consecutive User-agent lines head one shared group: the group
                // applies when any of them matches, not just the last one.
                $applies = $inAgentHeader ? ($applies || $matches) : $matches;
                $inAgentHeader = true;
                continue;
            }
            $inAgentHeader = false;

            if ($applies && ($type === 'allow' || $type === 'disallow')) {
                $rules[] = ['type' => $type, 'path' => $value];
            }
        }

        return $rules;
    }

    /**
     * Apply the collected rules to the request target.
     *
     * @param list<array{type: string, path: string}> $rules
     */
    private static function evaluate(array $rules, string $target): bool
    {
        $isAllowed = true;
        $currentStrength = 0;

        foreach ($rules as $rule) {
            // an empty Allow/Disallow value imposes no restriction
            if ($rule['path'] === '') {
                continue;
            }

            // Escape the rule, then re-enable the two robots.txt wildcards:
            // "*" matches any run of characters, a trailing "$" anchors the end.
            $pattern = str_replace('\*', '.*', preg_quote($rule['path'], '/'));
            if (str_ends_with($pattern, '\$')) {
                $pattern = substr($pattern, 0, -2) . '$';
            }

            if (preg_match("/^{$pattern}/", $target) !== 1) {
                continue;
            }

            // Prefer longer (more specific) rules, measured on the rule as
            // written rather than on its escaped form; Allow trumps Disallow
            // when both are the same length.
            $strength = strlen($rule['path']);
            $isAllow = $rule['type'] === 'allow';

            if ($strength > $currentStrength) {
                $currentStrength = $strength;
                $isAllowed = $isAllow;
            } elseif ($strength === $currentStrength && $isAllow) {
                $isAllowed = true;
            }
        }

        return $isAllowed;
    }
}

View File

@ -0,0 +1,79 @@
<?php
declare(strict_types=1);
namespace Lewisdale\App\Tools\Console;
use Symfony\Component\Console\{Attribute\AsCommand,
Command\Command,
Input\InputArgument,
Input\InputInterface,
Output\OutputInterface};
use Doctrine\Common\Collections\ArrayCollection;
use Doctrine\ORM\EntityManager;
use Lewisdale\App\Models\Data\Feed;
use Lewisdale\App\Models\Data\FeedFilter;
use Lewisdale\App\Models\Data\FilterTarget;
use Lewisdale\App\Models\Data\FilterType;
use Lewisdale\App\Models\Repositories\FeedRepository;
use function DI\add;
/**
 * Console command "test:feed": looks up a stored feed by URL and title,
 * attaches a hard-coded title filter ("S2", EXCLUDE), runs the feed's
 * filtering, persists the result and writes the filtered XML to
 * feed_filtered.xml in the current working directory.
 *
 * NOTE(review): a new "S2" filter is persisted on every run, so repeated
 * invocations accumulate duplicate filters on the feed.
 */
#[AsCommand(name: "test:feed")]
class TestFeed extends Command
{
    public function __construct(
        private readonly EntityManager $em,
        private readonly FeedRepository $feedRepository,
    ) {
        parent::__construct();
    }

    /**
     * Declare the two required positional arguments.
     */
    protected function configure(): void
    {
        $this
            ->addArgument("url", InputArgument::REQUIRED, "The URL of the feed to test")
            ->addArgument("title", InputArgument::REQUIRED, "The title of the feed to test");
    }

    /**
     * @return int Command::SUCCESS when the filtered feed was written,
     *             Command::FAILURE when the feed does not exist or the
     *             output file could not be written
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        $output->writeln([
            "Testing Feed",
            "============",
        ]);

        $url = $input->getArgument("url");
        $title = $input->getArgument("title");

        // findOneBy() is assumed to follow Doctrine's repository contract and
        // return null when nothing matches; bail out instead of dereferencing it.
        $saved_feed = $this->feedRepository->findOneBy(['url' => $url, 'title' => $title]);
        if ($saved_feed === null) {
            $output->writeln("No feed found for URL \"{$url}\" with title \"{$title}\"");

            return Command::FAILURE;
        }
        $output->writeln("Feed title: " . $saved_feed->title);

        $s2_filter = new FeedFilter(FilterTarget::TITLE, FilterType::EXCLUDE, "S2", $saved_feed);
        $this->em->persist($s2_filter);
        $this->em->flush();

        // Reload the managed entity so its filter collection reflects the row
        // just written. Looking it up again would only hand back the same
        // already-managed instance.
        $this->em->refresh($saved_feed);

        $saved_feed->filter();
        $this->em->persist($saved_feed);
        $this->em->flush();

        // asXML($filename) writes the file and returns a bool, not the XML, so
        // report where the file went rather than printing the bool.
        $target = 'feed_filtered.xml';
        if ($saved_feed->filteredFeed?->asXML($target) !== true) {
            $output->writeln("Could not write filtered feed to {$target}");

            return Command::FAILURE;
        }
        $output->writeln("Filtered feed written to {$target}");

        return Command::SUCCESS;
    }
}

View File

@ -4,6 +4,8 @@ use Lewisdale\App\Controllers\FeedController;
use Lewisdale\App\Controllers\SampleController;
use Slim\Views\TwigMiddleware;
ini_set('user_agent', 'Baleen/1.0 (https://lewisdale.dev)');
require_once __DIR__ . "/dependencies.php";
global $container;

View File

@ -6,6 +6,7 @@ use Doctrine\DBAL\Types\Type;
use Doctrine\ORM\EntityManager;
use Doctrine\ORM\ORMSetup;
use Lewisdale\App\Logging\FileLogger;
use Lewisdale\App\Models\Types\XMLElement;
use Lewisdale\App\TwigExtensions\CsrfExtension;
use Psr\Log\LoggerInterface;
use Ramsey\Uuid\Doctrine\UuidType;
@ -40,6 +41,7 @@ $container->set(LoggerInterface::class, $container->get(FileLogger::class));
$container->set(EntityManager::class, static function() {
Type::addType('uuid', UuidType::class);
Type::addType(XMLElement::NAME, XMLElement::class);
$config = ORMSetup::createAttributeMetadataConfiguration(
paths: array(__DIR__."/Models/Data"),
@ -51,6 +53,8 @@ $container->set(EntityManager::class, static function() {
'path' => __DIR__ . '/../' . getenv("SQLITE_DB_NAME"),
], $config);
$connection->executeQuery("PRAGMA foreign_keys = ON");
return new EntityManager($connection, $config);
});