Coverage for binette/main.py: 98%

186 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-14 14:36 +0000

1#!/usr/bin/env python 

2""" 

3Module : Main 

4Description : The main entry point for the program. 

5Copyright : (c) Jean Mainguy, 28 nov. 2022 

6License : GPL-3.0 

7Maintainer : Jean Mainguy 

8Portability : POSIX 

9""" 

10 

11import logging 

12import os 

13import sys 

14from pathlib import Path 

15from typing import Annotated 

16 

17import pyfastx 

18import typer 

19from rich.console import Console 

20from rich.logging import RichHandler 

21 

22import binette as binette_init 

23from binette import bin_manager, bin_quality, cds, contig_manager, diamond 

24from binette import io_manager as io 

25 

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Rich console writing to stderr; handed to the RichHandler built in setup_logging.
err_console = Console(stderr=True)

28 

29 

def version_callback(value: bool, ctx: typer.Context):
    """
    Print the Binette version and exit when --version is passed.

    :param value: Value of the --version flag.
    :param ctx: Typer context of the current invocation.

    :raises typer.Exit: Once the version has been echoed.
    """
    # Nothing to do when the context is in resilient-parsing mode or the flag is unset.
    if ctx.resilient_parsing or not value:
        return

    typer.echo(f"Binette {binette_init.__version__}")
    raise typer.Exit()

41 

42 

def setup_logging(verbose: bool = False, quiet: bool = False):
    """
    Configure logging according to the verbosity flags.

    The level is WARNING when ``quiet`` is set, DEBUG when ``verbose`` is set and
    INFO otherwise. Records are rendered by a RichHandler bound to ``err_console``.

    :param verbose: Enable DEBUG level logging.
    :param quiet: Restrict logging to WARNING level and above.

    :raises typer.BadParameter: If both ``verbose`` and ``quiet`` are set.
    """
    if verbose and quiet:
        raise typer.BadParameter("Cannot specify both --verbose and --quiet")

    # INFO is the default; quiet takes precedence over verbose (both cannot be set here).
    level = logging.INFO
    if quiet:
        level = logging.WARNING
    elif verbose:
        level = logging.DEBUG

    logging.basicConfig(
        level=level,
        format="%(message)s",
        datefmt="[%X]",
        handlers=[RichHandler(console=err_console)],
    )

    # Startup messages are skipped entirely in quiet mode.
    if quiet:
        return

    logger.info("Program started")
    logger.info(f"Command line: {' '.join(sys.argv)}")

67 

68 

def verbose_callback(verbose: bool):
    """
    Pass the --verbose flag through unchanged.

    Placeholder callback: the logging level itself is configured in the main function.

    :param verbose: Value of the --verbose flag.

    :return: The value received, untouched.
    """
    return verbose

75 

76 

def quiet_callback(quiet: bool):
    """
    Pass the --quiet flag through unchanged.

    Placeholder callback: the logging level itself is configured in the main function.

    :param quiet: Value of the --quiet flag.

    :return: The value received, untouched.
    """
    return quiet

83 

84 

def preprocess_args():
    """
    Rewrite ``sys.argv`` so that every option value is preceded by its option.

    Typer doesn't support whitespace-separated multi-value options, so a command
    such as:

        binette --bin_dirs dir1 dir2 dir3 --contigs assembly.fasta

    becomes:

        binette --bin_dirs dir1 --bin_dirs dir2 --bin_dirs dir3 --contigs assembly.fasta

    An argument is considered an option when it starts with "-". Arguments placed
    before the first option are kept as is, options without any value are kept once,
    and ``sys.argv`` is replaced by the rewritten list.
    """
    argv = sys.argv
    logger.debug(f"Initial CLI command is: {argv}")

    # Everything before the first option is the main command and is copied untouched.
    first_option_idx = next(
        (idx for idx, arg in enumerate(argv) if arg.startswith("-")),
        len(argv),
    )
    final_cmd = list(argv[:first_option_idx])
    logger.debug(f"Main command is: {final_cmd}")

    # Single pass over the remaining arguments: each option is followed by the run
    # of values that precedes the next option.
    idx = first_option_idx
    while idx < len(argv):
        option = argv[idx]
        idx += 1

        values_start = idx
        while idx < len(argv) and not argv[idx].startswith("-"):
            idx += 1
        values = argv[values_start:idx]

        if values:
            # Repeat the option in front of each of its values.
            for value in values:
                final_cmd.extend((option, value))
        else:
            # Flag-like option without value.
            final_cmd.append(option)

    # replace by reformatted
    logger.debug(f"Final command is: {final_cmd}")
    sys.argv = final_cmd

127 

128 

# Create the Typer app: completion options are not added, "-h" is accepted in
# addition to "--help", and rich markup mode is used.
app = typer.Typer(
    name="binette",
    help=f"Binette: binning refinement tool to constructs high quality MAGs. Version: {binette_init.__version__}",
    add_completion=False,
    context_settings={"help_option_names": ["-h", "--help"]},
    rich_markup_mode="rich",
)

137 

138 

def parse_input_files(
    bin_dirs: list[Path] | None,
    contig2bin_tables: list[Path] | None,
    contigs_fasta: Path,
    fasta_extensions: set[str] | None = None,
):
    """
    Parses input files to retrieve information related to bins and contigs.

    Bins are read from ``bin_dirs`` when it is non-empty, otherwise from
    ``contig2bin_tables``.

    :param bin_dirs: List of paths to directories containing bin FASTA files.
    :param contig2bin_tables: List of paths to contig-to-bin tables.
    :param contigs_fasta: Path to the contigs FASTA file.
    :param fasta_extensions: Possible fasta extensions to look for in the bin directory.
        Defaults to ".fasta", ".fa" and ".fna".

    :return: A tuple containing:
        - The bins built by bin_manager.make_bins_from_bins_info for the input bin sets.
        - The contigs found in the input bins.
        - Dictionary mapping contig indexes to their lengths.
        - The contig index built by contig_manager.make_contig_index (contig name to index).

    :raises ValueError: If contigs of the input bins are absent from the contigs FASTA file.
    """
    if fasta_extensions is None:
        fasta_extensions = {".fasta", ".fa", ".fna"}

    if bin_dirs:
        logger.info("Parsing bin directories")
        bin_name_to_bin_dir = io.infer_bin_set_names_from_input_paths(bin_dirs)
        bin_set_name_to_bins_info = bin_manager.parse_bin_directories(
            bin_name_to_bin_dir, fasta_extensions
        )
    else:
        logger.info("Parsing bin2contig files")
        bin_name_to_bin_table = io.infer_bin_set_names_from_input_paths(
            contig2bin_tables
        )
        bin_set_name_to_bins_info = bin_manager.parse_contig2bin_tables(
            bin_name_to_bin_table
        )

    logger.info(f"Processing {len(bin_set_name_to_bins_info)} bin sets")
    for bin_set_id, bins_info in bin_set_name_to_bins_info.items():
        logger.info(f" {bin_set_id} - {len(bins_info)} bins")

    contigs_in_bins = bin_manager.get_contigs_in_bin_sets(bin_set_name_to_bins_info)
    logger.info(f"Found {len(contigs_in_bins)} contigs in input bins")

    contig_to_index = contig_manager.make_contig_index(contigs_in_bins)

    contig_key_to_bin = bin_manager.make_bins_from_bins_info(
        bin_set_name_to_bins_info, contig_to_index, are_original_bins=True
    )

    logger.info(
        f"Parsing contig fasta file '{contigs_fasta}' to retrieve contig lengths"
    )

    # Set built once so that the membership test below is cheap for every record.
    contigs_in_bins_set = set(contigs_in_bins)
    contig_to_length = {
        name: len(seq)
        for name, seq in pyfastx.Fastx(contigs_fasta.as_posix())
        if name in contigs_in_bins_set
    }

    logger.debug("Finished parsing contig fasta file")

    # Check that all contigs from input bins are present in the contigs file.
    # Sorted so that the error message is reproducible from one run to the next.
    missing_contigs = sorted(contigs_in_bins_set.difference(contig_to_length))

    if missing_contigs:
        raise ValueError(
            f"{len(missing_contigs)} contigs from the input bins were not "
            f"found in the contigs file '{contigs_fasta}'. "
            f"The missing contigs are: {', '.join(missing_contigs)}. "
            f"Please ensure all contigs from input bins are present in "
            f"contig file."
        )
    logger.debug("No unexpected contigs found")

    # Re-key the lengths by contig index instead of contig name.
    contig_id_to_length = {
        contig_to_index[name]: length for name, length in contig_to_length.items()
    }

    return (
        contig_key_to_bin,
        contigs_in_bins,
        contig_id_to_length,
        contig_to_index,
    )

229 

230 

def manage_protein_alignement(
    faa_file: Path,
    contigs_fasta: Path,
    contigs_in_bins: set[str],
    diamond_result_file: Path,
    checkm2_db: Path | None,
    threads: int,
    use_existing_protein_file: bool,
    resume_diamond: bool,
    low_mem: bool,
) -> tuple[dict[str, int], dict[str, list[str]], dict[str, int | None] | None]:
    """
    Predicts proteins (or reuses an existing prediction) and runs diamond on them.

    :param faa_file: The path to the .faa file.
    :param contigs_fasta: The path to the contigs FASTA file.
    :param contigs_in_bins: Names of the contigs found in the input bins.
    :param diamond_result_file: The path to the diamond result file.
    :param checkm2_db: The path to the CheckM2 database. When None, the database
        returned by diamond.get_checkm2_db is used.
    :param threads: Number of threads for parallel processing.
    :param use_existing_protein_file: Whether to parse ``faa_file`` instead of predicting genes.
    :param resume_diamond: Whether to skip the diamond run and only parse ``diamond_result_file``.
    :param low_mem: Whether to use low memory mode.

    :return: A tuple (contig_to_kegg_counter, contig_to_genes, contig_to_coding_len).
        contig_to_coding_len is None when an existing protein file is used.

    :raises FileNotFoundError: If ``checkm2_db`` is given but does not exist.
    """
    faa_path = faa_file.as_posix()
    fasta_path = contigs_fasta.as_posix()
    diamond_result_path = diamond_result_file.as_posix()

    if not use_existing_protein_file:
        # Only contigs that belong to the input bins are sent to gene prediction.
        contigs_to_annotate = (
            (name, seq)
            for name, seq in pyfastx.Fastx(fasta_path)
            if name in contigs_in_bins
        )
        contig_to_genes, contig_to_coding_len = cds.predict(
            contigs_to_annotate, faa_path, threads
        )
        logger.info("Coding density will be computed from freshly identified genes")
    else:
        logger.info(f"Parsing protein sequences from '{faa_file}'")
        contig_to_genes = cds.parse_faa_file(faa_path)
        io.check_contig_consistency(
            contigs_in_bins,
            contig_to_genes,
            fasta_path,
            faa_path,
        )
        contig_to_coding_len = None
        logger.info(
            "Coding density will not be computed (using provided protein sequences)"
        )

    if not resume_diamond:
        if checkm2_db is None:
            # get checkm2 db stored in checkm2 install
            diamond_db_path = diamond.get_checkm2_db()
        elif checkm2_db.exists():
            diamond_db_path = checkm2_db.as_posix()
        else:
            raise FileNotFoundError(checkm2_db)

        # The log sits next to the result file and is named after the part of its
        # file name that precedes the first dot.
        log_name = diamond_result_file.stem.split(".")[0] + ".log"
        diamond_log = diamond_result_file.parents[0] / log_name

        diamond.run(
            faa_path,
            diamond_result_path,
            diamond_db_path,
            diamond_log.as_posix(),
            threads,
            low_mem=low_mem,
        )

    logger.info("Parsing diamond results")
    contig_to_kegg_counter = diamond.get_contig_to_kegg_id(diamond_result_path)

    # Check contigs from diamond vs input assembly consistency
    io.check_contig_consistency(
        contigs_in_bins,
        contig_to_kegg_counter,
        fasta_path,
        diamond_result_path,
    )

    return contig_to_kegg_counter, contig_to_genes, contig_to_coding_len

321 

322 

def log_selected_bin_info(
    selected_bins: list[bin_manager.Bin],
    hq_min_completeness: float,
    hq_max_conta: float,
):
    """
    Log information about selected bins based on quality thresholds.

    Each high-quality bin is listed at DEBUG level with its completeness and
    contamination, then the number of high-quality bins among the selected bins
    is reported at INFO level.

    :param selected_bins: List of Bin objects to analyze.
    :param hq_min_completeness: Minimum completeness threshold for high-quality bins.
    :param hq_max_conta: Maximum contamination threshold for high-quality bins.
    """
    # Evaluate the quality criterion once per bin; the result feeds both the debug
    # listing and the final count.
    high_quality_bins = [
        sb
        for sb in selected_bins
        if sb.is_high_quality(
            min_completeness=hq_min_completeness, max_contamination=hq_max_conta
        )
    ]

    # Log completeness and contamination in debug log
    logger.debug("High quality bins:")
    for sb in high_quality_bins:
        logger.debug(
            f" {sb} completeness={sb.completeness}, contamination={sb.contamination}"
        )

    # Log information about high-quality bins
    thresholds = (
        f"(completeness >= {hq_min_completeness} and contamination <= {hq_max_conta})"
    )
    logger.info(
        f"{len(high_quality_bins)}/{len(selected_bins)} selected bins have high quality {thresholds}"
    )

367 

368 

@app.command(
    help=f"Binette {binette_init.__version__}: fast and accurate binning refinement tool to constructs high quality MAGs from the output of multiple binning tools.",
    no_args_is_help=True,
)
def binette(
    # Input arguments - Mutually exclusive group (handled in code)
    bin_dirs: Annotated[
        list[Path] | None,
        typer.Option(
            "--bin_dirs",
            "-d",
            help="List of bin folders containing each bin in a fasta file.",
            exists=True,
            rich_help_panel="Input Arguments",
        ),
    ] = None,
    contig2bin_tables: Annotated[
        list[Path] | None,
        typer.Option(
            "--contig2bin_tables",
            "-b",
            help="List of contig2bin tables with two columns: contig, bin.",
            exists=True,
            rich_help_panel="Input Arguments",
        ),
    ] = None,
    contigs: Annotated[
        Path,
        typer.Option(
            "--contigs",
            "-c",
            help="Contigs in FASTA format.",
            exists=True,
            rich_help_panel="Input Arguments",
        ),
    ] = ...,  # Required
    proteins: Annotated[
        Path | None,
        typer.Option(
            "--proteins",
            "-p",
            help="FASTA file of predicted proteins in Prodigal format (>contigID_geneID). Skips the gene prediction step if provided.",
            exists=True,
            rich_help_panel="Input Arguments",
        ),
    ] = None,
    # Output & runtime control
    outdir: Annotated[
        Path,
        typer.Option(
            "--outdir",
            "-o",
            help="Output directory.",
            rich_help_panel="Output and Runtime Control",
        ),
    ] = Path("results"),
    prefix: Annotated[
        str,
        typer.Option(
            "--prefix",
            help="Prefix to add to final bin names (e.g. '--prefix sample1' will produce 'sample1_bin1.fa', 'sample1_bin2.fa').",
            rich_help_panel="Output and Runtime Control",
        ),
    ] = "binette",
    threads: Annotated[
        int,
        typer.Option(
            "--threads",
            "-t",
            help="Number of threads to use.",
            rich_help_panel="Output and Runtime Control",
        ),
    ] = 1,
    verbose: Annotated[
        bool,
        typer.Option(
            "--verbose",
            "-v",
            help="Enable verbose mode (show detailed debug information).",
            callback=verbose_callback,
            rich_help_panel="Output and Runtime Control",
        ),
    ] = False,
    quiet: Annotated[
        bool,
        typer.Option(
            "--quiet",
            "-q",
            help="Enable quiet mode (only show warnings and errors).",
            callback=quiet_callback,
            rich_help_panel="Output and Runtime Control",
        ),
    ] = False,
    debug: Annotated[
        bool,
        typer.Option(
            help="Activate debug mode.",
            rich_help_panel="Output and Runtime Control",
        ),
    ] = False,
    # Bin filtering & scoring
    min_completeness: Annotated[
        int,
        typer.Option(
            "--min_completeness",
            help="Minimum completeness required for intermediate bin creation and final bin selection.",
            rich_help_panel="Bin Filtering and Scoring",
        ),
    ] = 40,
    max_contamination: Annotated[
        int,
        typer.Option(
            "--max_contamination",
            help="Maximum contamination allowed for intermediate bin creation and final bin selection.",
            rich_help_panel="Bin Filtering and Scoring",
        ),
    ] = 10,
    min_length: Annotated[
        int,
        typer.Option(
            "--min_length",
            help="Minimum length (bp) required for intermediate bin creation and final bin selection.",
            rich_help_panel="Bin Filtering and Scoring",
        ),
    ] = 200_000,
    max_length: Annotated[
        int,
        typer.Option(
            "--max_length",
            help="Maximum length (bp) allowed for intermediate bin creation and final bin selection.",
            rich_help_panel="Bin Filtering and Scoring",
        ),
    ] = 10_000_000,
    contamination_weight: Annotated[
        float,
        typer.Option(
            "--contamination_weight",
            "-w",
            help="Bins are scored as: completeness - weight * contamination. A lower weight favors completeness over low contamination.",
            rich_help_panel="Bin Filtering and Scoring",
        ),
    ] = 2.0,
    # Advanced options
    fasta_extensions: Annotated[
        list[str],
        typer.Option(
            "--fasta_extensions",
            "-e",
            help="FASTA file extensions to search for in bin directories (used with --bin_dirs).",
            rich_help_panel="Advanced Options",
        ),
    ] = [  # noqa: B006
        ".fasta",
        ".fa",
        ".fna",
    ],
    checkm2_db: Annotated[
        Path | None,
        typer.Option(
            "--checkm2_db",
            help="Path to CheckM2 diamond database. By default the database set via <checkm2 database> is used.",
            rich_help_panel="Advanced Options",
        ),
    ] = None,
    low_mem: Annotated[
        bool,
        typer.Option(
            "--low_mem",
            help="Enable low-memory mode for Diamond.",
            rich_help_panel="Advanced Options",
        ),
    ] = False,
    resume: Annotated[
        bool,
        typer.Option(
            help="Resume mode: reuse existing temporary files if possible.",
            rich_help_panel="Advanced Options",
        ),
    ] = False,
    version: Annotated[
        bool | None,
        typer.Option(
            "--version",
            help="Show version and exit.",
            callback=version_callback,
            # Eager so that the version is printed before the other options
            # (e.g. the required --contigs) are validated.
            is_eager=True,
        ),
    ] = None,
    progress: Annotated[
        bool,
        typer.Option(
            help="Show progress bars (disable with --no-progress).",
            rich_help_panel="Output and Runtime Control",
        ),
    ] = True,
    write_fasta_bins: Annotated[
        bool,
        typer.Option(
            help="Write final selected bins as FASTA files (disable with --no-write-fasta-bins).",
            rich_help_panel="Output and Runtime Control",
        ),
    ] = True,
) -> int:
    """
    Orchestrate the execution of the program.

    Steps, in order: set up logging, validate the bin inputs, parse the bins and
    the contig lengths, predict or load proteins and align them with diamond,
    assess the quality of the input bins, create intermediate bins and assess
    their quality, select the best bins, then write the reports (and the FASTA
    files of the selected bins unless disabled).

    :return: 0 once all outputs have been written.

    :raises typer.BadParameter: If neither or both of --bin_dirs and
        --contig2bin_tables are provided, or if --verbose and --quiet are combined.
    """

    # Set up logging based on verbosity flags
    setup_logging(verbose=verbose, quiet=quiet)

    # Validate that exactly one of bin_dirs or contig2bin_tables is provided.
    # Truthiness is used so that an empty list is handled like a missing option,
    # which matches the `if bin_dirs:` dispatch done in parse_input_files.
    if not bin_dirs and not contig2bin_tables:
        raise typer.BadParameter(
            "Error: Either --bin_dirs or --contig2bin_tables must be provided. None were given."
        )

    if bin_dirs and contig2bin_tables:
        raise typer.BadParameter(
            "Error: Either --bin_dirs or --contig2bin_tables must be provided, but not both."
        )

    # High quality threshold used just to log number of high quality bins.
    hq_max_conta = 5
    hq_min_completeness = 90

    # Progress bars are hidden on request and always in quiet mode.
    disable_progress_bar = not progress or quiet

    # Temporary files #
    out_tmp_dir: Path = outdir / "temporary_files"
    os.makedirs(out_tmp_dir, exist_ok=True)

    use_existing_protein_file = False

    faa_file = out_tmp_dir / "assembly_proteins.faa.gz"

    diamond_result_file = out_tmp_dir / "diamond_result.tsv.gz"

    # Output files #
    final_bin_report: Path = outdir / "final_bins_quality_reports.tsv"
    original_bin_report_dir: Path = outdir / "input_bins_quality_reports"

    if resume:
        io.check_resume_file(faa_file, diamond_result_file)
        use_existing_protein_file = True

    (
        contig_key_to_original_bin,
        contigs_in_bins,
        contig_to_length,
        contig_to_index,
    ) = parse_input_files(
        bin_dirs,
        contig2bin_tables,
        contigs,
        fasta_extensions=set(fasta_extensions),
    )

    if debug:
        index_to_contig_file = outdir / "index_to_contig.tsv"
        logger.info(f"Writing index to contig mapping to '{index_to_contig_file}'")
        # NOTE(review): assumes the enumeration order of contigs_in_bins matches the
        # indexes stored in contig_to_index - TODO confirm against make_contig_index.
        with open(index_to_contig_file, "w") as flout:
            flout.write("\n".join(f"{i}\t{c}" for i, c in enumerate(contigs_in_bins)))

    # In resume mode the protein file of the temporary directory is reused, so a
    # provided --proteins file is only filtered when not resuming.
    if proteins and not resume:
        logger.info(f"Using the provided protein sequences file '{proteins}'")
        use_existing_protein_file = True

        cds.filter_faa_file(
            contigs_in_bins,
            input_faa_file=proteins,
            filtered_faa_file=faa_file,
        )

    contig_name_to_kegg_counter, contig_name_to_genes, contig_to_coding_length = (
        manage_protein_alignement(
            faa_file=faa_file,
            contigs_fasta=contigs,
            contigs_in_bins=contigs_in_bins,
            diamond_result_file=diamond_result_file,
            checkm2_db=checkm2_db,
            threads=threads,
            use_existing_protein_file=use_existing_protein_file,
            resume_diamond=resume,
            low_mem=low_mem,
        )
    )

    # Switch from contig names to contig indexes.
    contig_to_kegg_counter = contig_manager.apply_contig_index(
        contig_to_index, contig_name_to_kegg_counter
    )
    contig_to_genes = contig_manager.apply_contig_index(
        contig_to_index, contig_name_to_genes
    )
    # The coding lengths are None when an existing protein file is used.
    if contig_to_coding_length:
        contig_to_coding_length = contig_manager.apply_contig_index(
            contig_to_index, contig_to_coding_length
        )

    # Extract cds metadata ##
    logger.info("Computing CDS metadata")
    contig_metadata = cds.get_contig_cds_metadata(contig_to_genes, threads)

    contig_metadata["contig_to_kegg_counter"] = contig_to_kegg_counter
    contig_metadata["contig_to_length"] = contig_to_length

    logger.info("Adding size and assessing quality of input bins")
    original_bins = bin_quality.add_bin_metrics(
        list(contig_key_to_original_bin.values()),
        contig_metadata,
        contamination_weight,
        threads,
        disable_progress_bar=disable_progress_bar,
    )
    contig_key_to_original_bin = {b.contigs_key: b for b in original_bins}

    bin_quality.add_bin_size_and_N50(original_bins, contig_to_length)

    if contig_to_coding_length:
        bin_quality.add_bin_coding_density(original_bins, contig_to_coding_length)

    logger.info(
        f"Writing original input bin metrics to directory '{original_bin_report_dir}'"
    )
    io.write_original_bin_metrics(original_bins, original_bin_report_dir)

    logger.info("Creating intermediate bins")

    contig_lengths = bin_quality.prepare_contig_sizes(contig_to_length)

    contig_key_to_new_bin = bin_manager.create_intermediate_bins(
        contig_key_to_original_bin,
        contig_lengths=contig_lengths,
        min_comp=min_completeness,
        max_conta=max_contamination,
        min_len=min_length,
        max_len=max_length,
        disable_progress_bar=disable_progress_bar,
    )

    logger.info(f"Assessing quality for {len(contig_key_to_new_bin)} intermediate bins")

    new_bins = bin_quality.add_bin_metrics(
        bins=contig_key_to_new_bin.values(),
        contig_info=contig_metadata,
        contamination_weight=contamination_weight,
        threads=threads,
        disable_progress_bar=disable_progress_bar,
    )
    contig_key_to_new_bin = {b.contigs_key: b for b in new_bins}

    # On a shared contigs key the intermediate bin replaces the original one.
    contig_key_to_all_bin = contig_key_to_original_bin | contig_key_to_new_bin

    bin_quality.add_bin_size_and_N50(contig_key_to_all_bin.values(), contig_to_length)

    if debug:
        all_bin_compo_file = outdir / "all_bins_quality_reports.tsv"
        logger.info(f"Writing all bins to '{all_bin_compo_file}'")
        io.write_bin_info(
            contig_key_to_all_bin.values(), all_bin_compo_file, add_contigs=True
        )

    selected_bins = bin_manager.select_best_bins(
        contig_key_to_all_bin,
        min_completeness=min_completeness,
        max_contamination=max_contamination,
        prefix=prefix,
    )

    if contig_to_coding_length:
        bin_quality.add_bin_coding_density(selected_bins, contig_to_coding_length)

    logger.info(f"Writing selected bins information to '{final_bin_report}'")
    io.write_bin_info(selected_bins, output=final_bin_report)

    io.write_contig2bin_table(
        selected_bins,
        outdir / "final_contig_to_bin.tsv",
        contigs_in_bins,
    )

    if write_fasta_bins:
        logger.info(f"Writing selected bins FASTA files to '{outdir / 'final_bins'}'")
        io.write_bins_fasta(
            selected_bins,
            contigs,
            outdir=outdir / "final_bins",
            contigs_names=contigs_in_bins,
        )

    log_selected_bin_info(selected_bins, hq_min_completeness, hq_max_conta)

    return 0

756 

757 

def main():
    """Entry point: rewrite sys.argv with preprocess_args, then run the Typer app."""
    # Must run before app() so that Typer parses the rewritten command line.
    preprocess_args()

    app()