From 70e78bcf42d85630afb47feb0406dc4ee0054bbb Mon Sep 17 00:00:00 2001 From: Lewis Dale Date: Fri, 20 Sep 2024 13:22:05 +0100 Subject: [PATCH] Setup feeds, request, and filter them --- bin/doctrine | 9 ++- composer.json | 5 +- src/Controllers/FeedController.php | 2 +- src/Models/Data/Feed.php | 61 ++++++++++++++++++++ src/Models/Data/FeedFilter.php | 43 ++++++++++++++- src/Models/Types/XMLElement.php | 40 ++++++++++++++ src/Requests/Robots.php | 89 ++++++++++++++++++++++++++++++ src/Tools/Console/TestFeed.php | 79 ++++++++++++++++++++++++++ src/app.php | 2 + src/dependencies.php | 4 ++ 10 files changed, 330 insertions(+), 4 deletions(-) create mode 100644 src/Models/Types/XMLElement.php create mode 100644 src/Requests/Robots.php create mode 100644 src/Tools/Console/TestFeed.php diff --git a/bin/doctrine b/bin/doctrine index 20ceb0e..600436b 100644 --- a/bin/doctrine +++ b/bin/doctrine @@ -4,6 +4,8 @@ require_once __DIR__ . '/../vendor/autoload.php'; use Doctrine\ORM\EntityManager; use Doctrine\ORM\Tools\Console\ConsoleRunner; use Doctrine\ORM\Tools\Console\EntityManagerProvider\SingleManagerProvider; +use Lewisdale\App\Models\Repositories\FeedRepository; +use Lewisdale\App\Tools\Console\TestFeed; $dotenv = Dotenv\Dotenv::createImmutable([__DIR__, __DIR__ . "/.."]); $dotenv->load(); @@ -12,6 +14,11 @@ $dotenv->load(); require_once __DIR__ . '/../src/dependencies.php'; global $container; +$commands = [ + $container->get(TestFeed::class), +]; + ConsoleRunner::run( - new SingleManagerProvider($container->get(EntityManager::class)) + new SingleManagerProvider($container->get(EntityManager::class)), + $commands ); \ No newline at end of file diff --git a/composer.json b/composer.json index 39f699f..d77a07d 100644 --- a/composer.json +++ b/composer.json @@ -20,7 +20,10 @@ "symfony/dom-crawler": "^6.3", "symfony/css-selector": "^6.3", "symfony/http-client": "^6.3", - "league/uri": "^6.8" + "league/uri": "^6.8", + "ext-simplexml": "*", + "ext-curl": "*", + "ext-dom": "*" }, "require-dev": { "phpunit/phpunit": "^10.0" diff --git a/src/Controllers/FeedController.php b/src/Controllers/FeedController.php index 628d6e1..b711aab 100644 --- a/src/Controllers/FeedController.php +++ b/src/Controllers/FeedController.php @@ -26,7 +26,7 @@ class FeedController return $this->view->render($response, 'index.twig.html', ['feeds' => $feeds]); } - public function create(ServerRequestInterface $request, ResponseInterface $response) + public function create(ServerRequestInterface $request, ResponseInterface $response): ResponseInterface { $this->logger->info("FeedController::create() called"); $body = $request->getParsedBody(); diff --git a/src/Models/Data/Feed.php b/src/Models/Data/Feed.php index 356d1a4..11c5e72 100644 --- a/src/Models/Data/Feed.php +++ b/src/Models/Data/Feed.php @@ -4,9 +4,16 @@ declare(strict_types=1); namespace Lewisdale\App\Models\Data; +use Doctrine\Common\Collections\Collection; +use Doctrine\DBAL\Types\Types; use Doctrine\ORM\Mapping as ORM; +use DOMXPath; +use Lewisdale\App\Models\Types\XMLElement; +use Lewisdale\App\Requests\Robots; use Ramsey\Uuid\Doctrine\UuidGenerator; use Ramsey\Uuid\UuidInterface; +use SimpleXMLElement; +use function Symfony\Component\String\s; #[ORM\Entity] #[ORM\Table(name: 'feeds')] @@ -24,4 +31,58 @@ class Feed #[ORM\Column(type: 'string')] public string $title; + + + /** + * @var Collection + */ + #[ORM\OneToMany(mappedBy: 'feed', targetEntity: FeedFilter::class, cascade: ['persist', 'remove'])] + public Collection $feedFilters; + + #[ORM\Column(type: XMLElement::NAME, nullable: true)] + public SimpleXMLElement|null $remoteFeed; + + #[ORM\Column(type: XMLElement::NAME, nullable: true)] + public SimpleXMLElement|null $filteredFeed; + + public function fetch(): void + { + if (Robots::allowed($this->url)) { + $this->remoteFeed = simplexml_load_file($this->url); + } else { + throw new \Exception("Robots.txt disallows fetching this feed"); + } + } + + private function test_item(\DOMNode $item): bool { + foreach ($this->feedFilters as $filter) { + if (!$filter->execute(simplexml_import_dom($item))) { + return false; + } + } + return true; + } + + public function filter() : void + { + if ($this->remoteFeed === null) { + $this->fetch(); + } + + $dom = dom_import_simplexml($this->remoteFeed); + $doc = $dom->ownerDocument; + + $xpath = new DOMXPath($doc); + $filter_queries = implode("|", array_map(fn($filter) => $filter->to_xpath(), $this->feedFilters->toArray())); + + $xpath_query = '//item[' . $filter_queries . ']'; + + $to_remove = $xpath->query($xpath_query); + echo "Removing " . $to_remove->length . " items\n"; + foreach ($to_remove as $item) { + $item->parentNode->removeChild($item); + } + + $this->filteredFeed = simplexml_import_dom($dom); + } } \ No newline at end of file diff --git a/src/Models/Data/FeedFilter.php b/src/Models/Data/FeedFilter.php index 0d1f15a..13cdf0f 100644 --- a/src/Models/Data/FeedFilter.php +++ b/src/Models/Data/FeedFilter.php @@ -13,6 +13,24 @@ use Ramsey\Uuid\UuidInterface; #[ORM\HasLifecycleCallbacks] class FeedFilter { + public function __construct( + FilterTarget $target, + FilterType $filter, + string $value, + Feed $feed, + UuidInterface $id = null, + ) + { + if ($id !== null) { + $this->id = $id; + } + + $this->target = $target; + $this->filter = $filter; + $this->value = $value; + $this->feed = $feed; + } + #[ORM\Id] #[ORM\Column(type: "uuid", unique: true)] #[ORM\GeneratedValue(strategy: "CUSTOM")] @@ -33,7 +51,30 @@ class FeedFilter #[ORM\Column(type: 'string')] public string $value; - #[ORM\ManyToOne(targetEntity: Feed::class)] + #[ORM\ManyToOne(targetEntity: Feed::class, cascade: ['persist', 'remove'], inversedBy: 'feedFilters')] #[ORM\JoinColumn(name: 'feed_id', referencedColumnName: 'id')] public Feed $feed; + + public function to_xpath(): string { + return match ($this->filter) { + FilterType::EXACT => "{$this->target->value}='{$this->value}'", + FilterType::INCLUDE => "{$this->target->value}[not(contains(text(), \"{$this->value}\"))]", + FilterType::EXCLUDE => "{$this->target->value}[contains(text(), \"{$this->value}\")]", + }; + } + + public function execute(\SimpleXMLElement $item): bool { + $value = (string) match ($this->target) { + FilterTarget::TITLE => $item->title, + FilterTarget::DESCRIPTION => $item->description, + FilterTarget::LINK => $item->link, + }; + + return match ($this->filter) { + FilterType::INCLUDE => str_contains(strtolower($value), strtolower($this->value)), + FilterType::EXACT => $value === $this->value, + FilterType::REGEX => preg_match($this->value, $value), + FilterType::EXCLUDE => !str_contains($value, $this->value), + }; + } } \ No newline at end of file diff --git a/src/Models/Types/XMLElement.php b/src/Models/Types/XMLElement.php new file mode 100644 index 0000000..84a035b --- /dev/null +++ b/src/Models/Types/XMLElement.php @@ -0,0 +1,40 @@ +asXML(); + } else { + return $value; + } + } + + public function getName(): string + { + return self::NAME; + } +} \ No newline at end of file diff --git a/src/Requests/Robots.php b/src/Requests/Robots.php new file mode 100644 index 0000000..1fe020c --- /dev/null +++ b/src/Requests/Robots.php @@ -0,0 +1,89 @@ + $type, + 'match' => preg_quote(trim($rule), '/'), + ]; + } + + } + + $isAllowed = TRUE; + $currentStrength = 0; + + foreach ($rules as $rule) { + // check if page hits on a rule + if (preg_match("/^{$rule['match']}/", $parsed['path'])) { + // prefer longer (more specific) rules and Allow trumps Disallow if rules same length + $strength = strlen($rule['match']); + if ($currentStrength < $strength) { + $currentStrength = $strength; + $isAllowed = ("allow" == $rule['type']); + } elseif ($currentStrength == $strength && ("allow" == $rule['type'])) { + $currentStrength = $strength; + $isAllowed = TRUE; + } + } + } + + return $isAllowed; + } +} \ No newline at end of file diff --git a/src/Tools/Console/TestFeed.php b/src/Tools/Console/TestFeed.php new file mode 100644 index 0000000..717882f --- /dev/null +++ b/src/Tools/Console/TestFeed.php @@ -0,0 +1,79 @@ +addArgument("url", InputArgument::REQUIRED, "The URL of the feed to test") + ->addArgument("title", InputArgument::REQUIRED, "The title of the feed to test"); + } + + protected function execute(InputInterface $input, OutputInterface $output): int + { + $output->writeln([ + "Testing Feed", + "============", + ]); + + $url = $input->getArgument("url"); + $title = $input->getArgument("title"); + +// $output->writeln("Setting up a new feed for: $url"); +// +// $feed = new Feed(); +// $feed->url = $url; +// $feed->title = $title; +// $feed->fetch(); +// $feed->feedFilters = new ArrayCollection(); +// $feed->feedFilters->add( new FeedFilter(FilterTarget::TITLE, FilterType::INCLUDE, "[No Ads]", $feed)); +// +// $this->em->persist($feed); +// $this->em->flush(); +// +// $cnt = $this->feedRepository->count([]); +// $output->writeln("Feed count: $cnt"); + + $saved_feed = $this->feedRepository->findOneBy(['url' => $url, 'title' => $title]); + $output->writeln("Feed title: " . $saved_feed->title); + + $s2_filter = new FeedFilter(FilterTarget::TITLE, FilterType::EXCLUDE, "S2", $saved_feed); + $this->em->persist($s2_filter); + $this->em->flush(); + + $saved_feed = $this->feedRepository->findOneBy(['url' => $url, 'title' => $title]); + $saved_feed->filter(); + + $this->em->persist($saved_feed); + $this->em->flush(); + + $output->writeln("Filtered feed: " . $saved_feed->filteredFeed->asXML('feed_filtered.xml')); + + return Command::SUCCESS; + } +} \ No newline at end of file diff --git a/src/app.php b/src/app.php index c4cde4b..e4efe59 100644 --- a/src/app.php +++ b/src/app.php @@ -4,6 +4,8 @@ use Lewisdale\App\Controllers\FeedController; use Lewisdale\App\Controllers\SampleController; use Slim\Views\TwigMiddleware; +ini_set('user_agent', 'Baleen/1.0 (https://lewisdale.dev)'); + require_once __DIR__ . "/dependencies.php"; global $container; diff --git a/src/dependencies.php b/src/dependencies.php index bc28fe6..4950c53 100644 --- a/src/dependencies.php +++ b/src/dependencies.php @@ -6,6 +6,7 @@ use Doctrine\DBAL\Types\Type; use Doctrine\ORM\EntityManager; use Doctrine\ORM\ORMSetup; use Lewisdale\App\Logging\FileLogger; +use Lewisdale\App\Models\Types\XMLElement; use Lewisdale\App\TwigExtensions\CsrfExtension; use Psr\Log\LoggerInterface; use Ramsey\Uuid\Doctrine\UuidType; @@ -40,6 +41,7 @@ $container->set(LoggerInterface::class, $container->get(FileLogger::class)); $container->set(EntityManager::class, static function() { Type::addType('uuid', UuidType::class); + Type::addType(XMLElement::NAME, XMLElement::class); $config = ORMSetup::createAttributeMetadataConfiguration( paths: array(__DIR__."/Models/Data"), @@ -51,6 +53,8 @@ $container->set(EntityManager::class, static function() { 'path' => __DIR__ . '/../' . getenv("SQLITE_DB_NAME"), ], $config); + $connection->executeQuery("PRAGMA foreign_keys = ON"); + return new EntityManager($connection, $config); });